Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
weixin_41840029
PaddleOCR
提交
eb7ce442
P
PaddleOCR
项目概览
weixin_41840029
/
PaddleOCR
与 Fork 源项目一致
Fork自
PaddlePaddle / PaddleOCR
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
eb7ce442
编写于
6月 03, 2021
作者:
W
WenmuZhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add table eval and predict script
上级
79436248
变更
2
显示空白变更内容
内联
并排
Showing
2 changed file
with
247 addition
and
0 deletion
+247
-0
ppstructure/table/matcher.py
ppstructure/table/matcher.py
+0
-0
ppstructure/table/table_metric/table_metric.py
ppstructure/table/table_metric/table_metric.py
+247
-0
未找到文件。
pp
ocr/utils/table_utils
/matcher.py
→
pp
structure/table
/matcher.py
浏览文件 @
eb7ce442
文件已移动
ppstructure/table/table_metric/table_metric.py
0 → 100755
浏览文件 @
eb7ce442
# Copyright 2020 IBM
# Author: peter.zhong@au1.ibm.com
#
# This is free software; you can redistribute it and/or modify
# it under the terms of the Apache 2.0 License.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Apache 2.0 License for more details.
import
distance
from
apted
import
APTED
,
Config
from
apted.helpers
import
Tree
from
lxml
import
etree
,
html
from
collections
import
deque
from
.parallel
import
parallel_process
from
tqdm
import
tqdm
class TableTree(Tree):
    """Tree node for an HTML table element, built on apted's ``Tree`` helper.

    Each node stores the element tag, its column/row span, the cell content
    and an ordered list of child nodes.
    """

    def __init__(self, tag, colspan=None, rowspan=None, content=None, *children):
        """Store the node attributes and collect ``children`` into a list."""
        self.tag, self.colspan, self.rowspan = tag, colspan, rowspan
        self.content = content
        self.children = [*children]

    def bracket(self):
        """Show tree using brackets notation"""
        if self.tag != 'td':
            head = '"tag": %s' % self.tag
        else:
            # Only cells report their spans and content.
            cell_fields = (self.tag, self.colspan, self.rowspan, self.content)
            head = '"tag": %s, "colspan": %d, "rowspan": %d, "text": %s' % cell_fields
        # Children are rendered recursively and appended in order.
        nested = ''.join(child.bracket() for child in self.children)
        return '{' + head + nested + '}'
class CustomConfig(Config):
    """apted ``Config`` whose rename cost compares table-node attributes."""

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value
        """
        return max(len(seq) for seq in sequences)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        edits = float(distance.levenshtein(*sequences))
        longest = self.maximum(*sequences)
        return edits / longest

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        same_shape = (
            node1.tag == node2.tag
            and node1.colspan == node2.colspan
            and node1.rowspan == node2.rowspan
        )
        if not same_shape:
            # Any mismatch in tag or span costs a full edit.
            return 1.
        if node1.tag == 'td' and (node1.content or node2.content):
            # Matching cells are charged by how much their content differs.
            return self.normalized_distance(node1.content, node2.content)
        return 0.
class CustomConfig_del_short(Config):
    """apted ``Config`` variant that masks very short cell contents.

    Cell contents with fewer than 3 items are replaced by the placeholder
    ``['####']`` before the distance is computed.
    """

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value
        """
        lengths = [len(seq) for seq in sequences]
        return max(lengths)

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        raw = distance.levenshtein(*sequences)
        return float(raw) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees"""
        key1 = (node1.tag, node1.colspan, node1.rowspan)
        key2 = (node2.tag, node2.colspan, node2.rowspan)
        if key1 != key2:
            return 1.
        if node1.tag != 'td':
            return 0.
        if not (node1.content or node2.content):
            return 0.
        # Contents shorter than 3 items are swapped for a fixed placeholder,
        # so two short cells always compare as equal.
        left = ['####'] if len(node1.content) < 3 else node1.content
        right = ['####'] if len(node2.content) < 3 else node2.content
        return self.normalized_distance(left, right)
class CustomConfig_del_block(Config):
    """apted ``Config`` variant that ignores blank (``' '``) tokens in cells.

    Space tokens are dropped from both cell contents before the normalized
    Levenshtein distance is computed.
    """

    @staticmethod
    def maximum(*sequences):
        """Get maximum possible value
        """
        return max(map(len, sequences))

    def normalized_distance(self, *sequences):
        """Get distance from 0 to 1
        """
        return float(distance.levenshtein(*sequences)) / self.maximum(*sequences)

    def rename(self, node1, node2):
        """Compares attributes of trees

        Returns 1.0 when tag, colspan or rowspan differ. For matching ``td``
        nodes with any content, returns the normalized distance between the
        contents with ``' '`` tokens removed. Returns 0.0 otherwise.
        """
        if (node1.tag != node2.tag) or (node1.colspan != node2.colspan) or (
                node1.rowspan != node2.rowspan):
            return 1.
        if node1.tag == 'td':
            if node1.content or node2.content:
                # Build filtered copies rather than popping in place, so the
                # token lists stored on the tree nodes are left untouched.
                node1_content = [tok for tok in node1.content if tok != ' ']
                node2_content = [tok for tok in node2.content if tok != ' ']
                if not node1_content and not node2_content:
                    # Both cells held only blanks: they are identical, and
                    # normalizing would divide by a maximum length of zero.
                    return 0.
                return self.normalized_distance(node1_content, node2_content)
        return 0.
class TEDS(object):
    ''' Tree Edit Distance based Similarity
    '''

    def __init__(self, structure_only=False, n_jobs=1, ignore_nodes=None):
        ''' Stores the evaluation options
            @params structure_only: when true, cell contents are left empty so
                                    only tags and spans are compared
            @params n_jobs: passed to parallel_process when greater than 1
            @params ignore_nodes: tag names stripped from both tables before
                                  scoring (passed to etree.strip_tags)
        '''
        assert isinstance(n_jobs, int) and (n_jobs >= 1), \
            'n_jobs must be an integer greater than or equal to 1'
        self.structure_only = structure_only
        self.n_jobs = n_jobs
        self.ignore_nodes = ignore_nodes
        self.__tokens__ = []

    def tokenize(self, node):
        ''' Tokenizes table cells

            Appends to self.__tokens__, in order: the opening tag, each
            character of the node text, the tokens of every child, the closing
            tag (skipped for 'unk' nodes) and each character of the tail text
            (skipped for 'td' nodes).
        '''
        self.__tokens__.append('<%s>' % node.tag)
        if node.text is not None:
            self.__tokens__ += list(node.text)
        # Iterating an element yields its children; getchildren() is
        # deprecated in lxml.
        for n in node:
            self.tokenize(n)
        if node.tag != 'unk':
            self.__tokens__.append('</%s>' % node.tag)
        if node.tag != 'td' and node.tail is not None:
            self.__tokens__ += list(node.tail)

    def load_html_tree(self, node, parent=None):
        ''' Converts HTML tree to the format required by apted

            Returns the converted root node when parent is None. Recursive
            calls attach the new node to parent and return None.
        '''
        if node.tag == 'td':
            if self.structure_only:
                cell = []
            else:
                self.__tokens__ = []
                self.tokenize(node)
                # Drop the outer '<td>' and '</td>' tokens; slicing already
                # yields a new list.
                cell = self.__tokens__[1:-1]
            new_node = TableTree(node.tag,
                                 int(node.attrib.get('colspan', '1')),
                                 int(node.attrib.get('rowspan', '1')),
                                 cell)
        else:
            new_node = TableTree(node.tag, None, None, None)
        if parent is not None:
            parent.children.append(new_node)
        if node.tag != 'td':
            # Cell contents were already flattened into tokens above, so only
            # non-cell nodes are descended into.
            for n in node:
                self.load_html_tree(n, new_node)
        if parent is None:
            return new_node

    def evaluate(self, pred, true):
        ''' Computes TEDS score between the prediction and the ground truth of a
            given sample

            Returns 0.0 when either input is empty or lacks a 'body/table'
            element. Otherwise returns 1 - edit_distance / n_nodes, where
            n_nodes is the larger descendant count of the two tables.
        '''
        if (not pred) or (not true):
            return 0.0
        parser = html.HTMLParser(remove_comments=True, encoding='utf-8')
        pred = html.fromstring(pred, parser=parser)
        true = html.fromstring(true, parser=parser)
        pred_tables = pred.xpath('body/table')
        true_tables = true.xpath('body/table')
        if not (pred_tables and true_tables):
            return 0.0
        pred = pred_tables[0]
        true = true_tables[0]
        if self.ignore_nodes:
            etree.strip_tags(pred, *self.ignore_nodes)
            etree.strip_tags(true, *self.ignore_nodes)
        n_nodes_pred = len(pred.xpath(".//*"))
        n_nodes_true = len(true.xpath(".//*"))
        n_nodes = max(n_nodes_pred, n_nodes_true)
        if n_nodes == 0:
            # Both tables have no descendant elements, so they are
            # structurally identical; this also avoids dividing by zero.
            return 1.0
        tree_pred = self.load_html_tree(pred)
        tree_true = self.load_html_tree(true)
        # Named edit_distance so the imported `distance` module is not shadowed.
        edit_distance = APTED(tree_pred, tree_true,
                              CustomConfig()).compute_edit_distance()
        return 1.0 - (float(edit_distance) / n_nodes)

    def batch_evaluate(self, pred_json, true_json):
        ''' Computes TEDS score between the prediction and the ground truth of
            a batch of samples
            @params pred_json: {'FILENAME': 'HTML CODE', ...}
            @params true_json: {'FILENAME': {'html': 'HTML CODE'}, ...}
            @output: {'FILENAME': 'TEDS SCORE', ...}
        '''
        samples = true_json.keys()
        if self.n_jobs == 1:
            # A missing prediction is passed as '' and scored 0.0 by evaluate.
            scores = [
                self.evaluate(pred_json.get(filename, ''),
                              true_json[filename]['html'])
                for filename in tqdm(samples)
            ]
        else:
            inputs = [{
                'pred': pred_json.get(filename, ''),
                'true': true_json[filename]['html']
            } for filename in samples]
            # NOTE(review): presumably parallel_process returns results in
            # input order; confirm, since scores are zipped with samples below.
            scores = parallel_process(
                inputs,
                self.evaluate,
                use_kwargs=True,
                n_jobs=self.n_jobs,
                front_num=1)
        scores = dict(zip(samples, scores))
        return scores

    def batch_evaluate_html(self, pred_htmls, true_htmls):
        ''' Computes TEDS score between the prediction and the ground truth of
            a batch of samples

            The two iterables are paired with zip, so extra items in the
            longer one are ignored. Returns the scores, not a dict.
        '''
        if self.n_jobs == 1:
            scores = [
                self.evaluate(pred_html, true_html)
                for (pred_html, true_html) in zip(pred_htmls, true_htmls)
            ]
        else:
            inputs = [{
                "pred": pred_html,
                "true": true_html
            } for (pred_html, true_html) in zip(pred_htmls, true_htmls)]
            scores = parallel_process(
                inputs,
                self.evaluate,
                use_kwargs=True,
                n_jobs=self.n_jobs,
                front_num=1)
        return scores
if __name__ == '__main__':
    # Demo entry point: score the sample predictions against the sample
    # ground truth and pretty-print the per-file TEDS scores.
    import json
    import pprint

    # JSON is read explicitly as UTF-8, because the platform default encoding
    # can mis-decode non-ASCII cell text.
    with open('sample_pred.json', encoding='utf-8') as fp:
        pred_json = json.load(fp)
    with open('sample_gt.json', encoding='utf-8') as fp:
        true_json = json.load(fp)
    teds = TEDS(n_jobs=4)
    scores = teds.batch_evaluate(pred_json, true_json)
    pp = pprint.PrettyPrinter()
    pp.pprint(scores)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录