Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
cf1a395e
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
cf1a395e
编写于
4月 15, 2022
作者:
H
Hui Zhang
提交者:
GitHub
4月 15, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1709 from zh794390558/wer
[asr]add wer tools
上级
91e255ce
520673df
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
501 addition
and
0 deletion
+501
-0
utils/compute-wer.py
utils/compute-wer.py
+501
-0
未找到文件。
utils/compute-wer.py
0 → 100755
浏览文件 @
cf1a395e
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# CopyRight WeNet Apache-2.0 License
import codecs
import re
import sys
import unicodedata

# When True, <tag>-style annotations (e.g. <unk>, <noise>) are stripped from
# tokens before scoring; can be overridden from the command line via --rt=.
remove_tag = True

# Characters treated as token separators / ignorable whitespace.
spacelist = [' ', '\t', '\r', '\n']

# ASCII and fullwidth CJK punctuation that characterize() drops entirely.
puncts = [
    '!', ',', '?', '、', '。', '!', ',', ';', '?', ':',
    '「', '」', '︰', '『', '』', '《', '》'
]
def characterize(string):
    """Tokenize a mixed-script line for character-level scoring.

    Each Unicode letter-other character (the 'Lo' category, which covers CJK
    ideographs) becomes its own token; punctuation from ``puncts`` and
    whitespace/unassigned characters are dropped; any other run of ASCII-range
    characters is grouped into one token, with '<...>'-style tags (e.g.
    ``<unk><noise>``) split into separate tokens at the closing '>'.
    """
    tokens = []
    pos = 0
    n = len(string)
    while pos < n:
        ch = string[pos]
        if ch in puncts:
            pos += 1
            continue
        category = unicodedata.category(ch)
        # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
        if category in ('Zs', 'Cn') or ch in spacelist:
            # space or not-assigned code point: skip
            pos += 1
            continue
        if category == 'Lo':
            # letter-other (e.g. a CJK ideograph): one char per token
            tokens.append(ch)
            pos += 1
            continue
        # Otherwise collect a run; a '<' starts a tag that ends at '>',
        # so inputs like <unk><noise> separate into two tokens.
        terminator = '>' if ch == '<' else ' '
        end = pos + 1
        while end < n:
            nxt = string[end]
            if ord(nxt) >= 128 or nxt in spacelist or nxt == terminator:
                break
            end += 1
        if end < n and string[end] == '>':
            end += 1  # keep the closing '>' inside the tag token
        tokens.append(string[pos:end])
        pos = end
    return tokens
def stripoff_tags(x):
    """Return *x* with every '<...>' span removed.

    An unterminated '<' discards the remainder of the string; an empty or
    falsy input yields ''.
    """
    if not x:
        return ''
    kept = []
    in_tag = False
    for ch in x:
        if in_tag:
            # consume until (and including) the closing '>'
            if ch == '>':
                in_tag = False
        elif ch == '<':
            in_tag = True
        else:
            kept.append(ch)
    return ''.join(kept)
def normalize(sentence, ignore_words, cs, split=None):
    """Normalize a token sequence before scoring.

    sentence, ignore_words are both in unicode.  Upper-cases tokens unless
    ``cs`` (case-sensitive) is true, drops tokens in ``ignore_words``, strips
    '<...>' tags when the module-level ``remove_tag`` flag is set, and expands
    tokens through the optional ``split`` mapping (token -> list of words).
    """
    out = []
    for token in sentence:
        word = token if cs else token.upper()
        if word in ignore_words:
            continue
        if remove_tag:
            word = stripoff_tags(word)
        if not word:
            # token vanished entirely (e.g. it was only a tag)
            continue
        if split and word in split:
            out.extend(split[word])
        else:
            out.append(word)
    return out
class Calculator:
    """Accumulates word-level alignment statistics across many utterances.

    Attributes:
        data:  per-token running counts, token -> {'all','cor','sub','ins','del'}.
        space: the dynamic-programming matrix; kept on the instance so rows
               are only appended (never shrunk) and storage is reused across
               calls to calculate().
        cost:  edit-operation costs (correct = 0; sub/del/ins = 1 each,
               i.e. plain Levenshtein distance).
    """

    def __init__(self):
        self.data = {}
        self.space = []
        self.cost = {}
        self.cost['cor'] = 0
        self.cost['sub'] = 1
        self.cost['del'] = 1
        self.cost['ins'] = 1

    def calculate(self, lab, rec):
        """Align reference ``lab`` against hypothesis ``rec``.

        NOTE: mutates both argument lists in place by inserting a leading ''
        sentinel (DP row/column 0).  Updates the per-token tallies in
        ``self.data`` as a side effect and returns a dict with the aligned
        sequences ('lab', 'rec', padded with '' at ins/del positions) and the
        utterance counts ('all', 'cor', 'sub', 'ins', 'del').  'all' counts
        reference tokens only, so insertions do not increment it.
        """
        # Initialization
        lab.insert(0, '')
        rec.insert(0, '')
        # Grow the persistent DP matrix to at least len(lab) x len(rec),
        # resetting any previously used cells.
        while len(self.space) < len(lab):
            self.space.append([])
        for row in self.space:
            for element in row:
                element['dist'] = 0
                element['error'] = 'non'
            while len(row) < len(rec):
                row.append({'dist': 0, 'error': 'non'})
        # First column: deleting i reference tokens; first row: inserting j.
        for i in range(len(lab)):
            self.space[i][0]['dist'] = i
            self.space[i][0]['error'] = 'del'
        for j in range(len(rec)):
            self.space[0][j]['dist'] = j
            self.space[0][j]['error'] = 'ins'
        self.space[0][0]['error'] = 'non'
        # Make sure every (non-sentinel) token has a counter entry.
        for token in lab:
            if token not in self.data and len(token) > 0:
                self.data[token] = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in rec:
            if token not in self.data and len(token) > 0:
                self.data[token] = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        # Computing edit distance
        for i, lab_token in enumerate(lab):
            for j, rec_token in enumerate(rec):
                if i == 0 or j == 0:
                    continue
                min_dist = sys.maxsize
                min_error = 'none'
                # candidate: delete lab_token
                dist = self.space[i - 1][j]['dist'] + self.cost['del']
                error = 'del'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                # candidate: insert rec_token
                dist = self.space[i][j - 1]['dist'] + self.cost['ins']
                error = 'ins'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                # candidate: match or substitute on the diagonal
                if lab_token == rec_token:
                    dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
                    error = 'cor'
                else:
                    dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
                    error = 'sub'
                if dist < min_dist:
                    min_dist = dist
                    min_error = error
                self.space[i][j]['dist'] = min_dist
                self.space[i][j]['error'] = min_error
        # Tracing back
        result = {'lab': [], 'rec': [], 'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        i = len(lab) - 1
        j = len(rec) - 1
        while True:
            if self.space[i][j]['error'] == 'cor':  # correct
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
                result['all'] = result['all'] + 1
                result['cor'] = result['cor'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, rec[j])
                i = i - 1
                j = j - 1
            elif self.space[i][j]['error'] == 'sub':  # substitution
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
                result['all'] = result['all'] + 1
                result['sub'] = result['sub'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, rec[j])
                i = i - 1
                j = j - 1
            elif self.space[i][j]['error'] == 'del':  # deletion
                if len(lab[i]) > 0:
                    self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
                    self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
                result['all'] = result['all'] + 1
                result['del'] = result['del'] + 1
                result['lab'].insert(0, lab[i])
                result['rec'].insert(0, "")
                i = i - 1
            elif self.space[i][j]['error'] == 'ins':  # insertion
                # insertions are charged to the hypothesis token and do not
                # increment 'all' (reference length)
                if len(rec[j]) > 0:
                    self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
                result['ins'] = result['ins'] + 1
                result['lab'].insert(0, "")
                result['rec'].insert(0, rec[j])
                j = j - 1
            elif self.space[i][j]['error'] == 'non':  # starting point
                break
            else:  # shouldn't reach here
                # NOTE(review): no break/raise in this branch — if it were
                # ever reached the loop would print forever; left as-is.
                print(
                    'this should not happen , i = {i} , j = {j} , error = {error}'
                    .format(i=i, j=j, error=self.space[i][j]['error']))
        return result

    def overall(self):
        """Sum the per-token counters over every token seen so far."""
        result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in self.data:
            result['all'] = result['all'] + self.data[token]['all']
            result['cor'] = result['cor'] + self.data[token]['cor']
            result['sub'] = result['sub'] + self.data[token]['sub']
            result['ins'] = result['ins'] + self.data[token]['ins']
            result['del'] = result['del'] + self.data[token]['del']
        return result

    def cluster(self, data):
        """Sum the per-token counters over the tokens listed in ``data``.

        Tokens never seen by calculate() are silently skipped.
        """
        result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
        for token in data:
            if token in self.data:
                result['all'] = result['all'] + self.data[token]['all']
                result['cor'] = result['cor'] + self.data[token]['cor']
                result['sub'] = result['sub'] + self.data[token]['sub']
                result['ins'] = result['ins'] + self.data[token]['ins']
                result['del'] = result['del'] + self.data[token]['del']
        return result

    def keys(self):
        """Return the list of all tokens with accumulated statistics."""
        return list(self.data.keys())
def width(string):
    """Display width of *string* in terminal columns.

    Characters whose East Asian width is Ambiguous, Fullwidth or Wide
    ('A', 'F', 'W') count as two columns; everything else counts as one.
    """
    total = 0
    for ch in string:
        total += 2 if unicodedata.east_asian_width(ch) in "AFW" else 1
    return total
def default_cluster(word):
    """Classify *word* into a script cluster by its characters' Unicode names.

    Returns one of 'Number', 'Mandarin', 'English', 'Japanese', or the
    label shared by all characters; a handful of in-word punctuation marks
    (& ' @ ℃ = . - _ # + ;) are ignored.  Any unrecognized character, a mix
    of labels, or a word left empty after ignoring punctuation gives 'Other'.
    Note: unicodedata.name() raises ValueError on unnamed code points.
    """
    ignorable_prefixes = (
        'AMPERSAND', 'APOSTROPHE', 'COMMERCIAL AT', 'DEGREE CELSIUS',
        'EQUALS SIGN', 'FULL STOP', 'HYPHEN-MINUS', 'LOW LINE',
        'NUMBER SIGN', 'PLUS SIGN', 'SEMICOLON')
    labels = []
    for char_name in (unicodedata.name(ch) for ch in word):
        if char_name.startswith('DIGIT'):  # 1
            labels.append('Number')
        elif char_name.startswith(('CJK UNIFIED IDEOGRAPH',
                                   'CJK COMPATIBILITY IDEOGRAPH')):
            # 明 / 郎
            labels.append('Mandarin')
        elif char_name.startswith(('LATIN CAPITAL LETTER',
                                   'LATIN SMALL LETTER')):
            # A / a
            labels.append('English')
        elif char_name.startswith('HIRAGANA LETTER'):  # は こ め
            labels.append('Japanese')
        elif char_name.startswith(ignorable_prefixes):
            # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
            continue
        else:
            return 'Other'
    if not labels:
        return 'Other'
    first = labels[0]
    if any(label != first for label in labels):
        return 'Other'
    return first
def usage():
    """Print a short description and the command-line synopsis."""
    print("compute-wer.py : compute word error rate (WER) "
          "and align recognition results and references.")
    print("         usage : python compute-wer.py [--cs={0,1}] "
          "[--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] "
          "[--padding-symbol={space,underline}] test.ref test.hyp > test.wer")
if __name__ == '__main__':
    # Usage: compute-wer.py [--options...] test.ref test.hyp > test.wer
    # Scores the hypothesis file against the reference file, printing
    # per-utterance alignments (when verbose), the overall WER, per-script
    # cluster WERs, and optional user-defined cluster WERs.
    if len(sys.argv) == 1:
        usage()
        sys.exit(0)
    calculator = Calculator()
    cluster_file = ''                 # optional <Tag>word...</Tag> cluster definitions
    ignore_words = set()              # tokens excluded from scoring
    tochar = False                    # True: character-level scoring (for CJK)
    verbose = 1                       # 0: summary only; >=1: per-utt; >1: ids in alignment
    padding_symbol = ' '              # filler used to column-align lab/rec output
    case_sensitive = False
    max_words_per_line = sys.maxsize  # wrap alignment lines after this many tokens
    split = None                      # optional token -> replacement-words mapping

    # Consume leading --option= switches; the two positional file names
    # slide down to argv[1]/argv[2] as switches are deleted.
    while len(sys.argv) > 3:
        a = '--maxw='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):]
            del sys.argv[1]
            max_words_per_line = int(b)
            continue
        a = '--rt='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            remove_tag = (b == 'true') or (b != '0')
            continue
        a = '--cs='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            case_sensitive = (b == 'true') or (b != '0')
            continue
        a = '--cluster='
        if sys.argv[1].startswith(a):
            cluster_file = sys.argv[1][len(a):]
            del sys.argv[1]
            continue
        a = '--splitfile='
        if sys.argv[1].startswith(a):
            split_file = sys.argv[1][len(a):]
            del sys.argv[1]
            split = dict()
            with codecs.open(split_file, 'r', 'utf-8') as fh:
                for line in fh:  # line in unicode
                    words = line.strip().split()
                    if len(words) >= 2:
                        # first field expands to the remaining fields
                        split[words[0]] = words[1:]
            continue
        a = '--ig='
        if sys.argv[1].startswith(a):
            ignore_file = sys.argv[1][len(a):]
            del sys.argv[1]
            with codecs.open(ignore_file, 'r', 'utf-8') as fh:
                for line in fh:  # line in unicode
                    line = line.strip()
                    if len(line) > 0:
                        ignore_words.add(line)
            continue
        a = '--char='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            tochar = (b == 'true') or (b != '0')
            continue
        a = '--v='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            verbose = 0
            try:
                verbose = int(b)
            except ValueError:
                # was a bare `except:`; only int() can raise here, and only
                # ValueError for a str argument
                if b == 'true' or b != '0':
                    verbose = 1
            continue
        a = '--padding-symbol='
        if sys.argv[1].startswith(a):
            b = sys.argv[1][len(a):].lower()
            del sys.argv[1]
            if b == 'space':
                padding_symbol = ' '
            elif b == 'underline':
                padding_symbol = '_'
            continue
        # Unrecognized argument: drop it and keep scanning.  (The original
        # guard was `if True or argv[1].startswith('-')`, i.e. unconditional;
        # this also swallows extra positional arguments.)
        del sys.argv[1]
        continue

    if not case_sensitive:
        ig = set([w.upper() for w in ignore_words])
        ignore_words = ig
    default_clusters = {}  # cluster name -> {word: 1}
    default_words = {}     # word -> cluster name (memoizes default_cluster)
    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    rec_set = {}           # utterance id -> normalized hypothesis tokens
    if split and not case_sensitive:
        # Upper-case the split table so lookups match upper-cased tokens.
        newsplit = dict()
        for w in split:
            words = split[w]
            for i in range(len(words)):
                words[i] = words[i].upper()
            newsplit[w.upper()] = words
        split = newsplit

    with codecs.open(hyp_file, 'r', 'utf-8') as fh:
        for line in fh:
            if tochar:
                array = characterize(line)
            else:
                array = line.strip().split()
            if len(array) == 0:
                continue
            fid = array[0]
            rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
                                     split)

    # compute error rate on the intersection of reference file and hyp file
    for line in open(ref_file, 'r', encoding='utf-8'):
        if tochar:
            array = characterize(line)
        else:
            array = line.rstrip('\n').split()
        if len(array) == 0:
            continue
        fid = array[0]
        if fid not in rec_set:
            continue
        lab = normalize(array[1:], ignore_words, case_sensitive, split)
        rec = rec_set[fid]
        if verbose:
            print('\nutt: %s' % fid)

        # Record the default script cluster of every new word.
        for word in rec + lab:
            if word not in default_words:
                default_cluster_name = default_cluster(word)
                if default_cluster_name not in default_clusters:
                    default_clusters[default_cluster_name] = {}
                if word not in default_clusters[default_cluster_name]:
                    default_clusters[default_cluster_name][word] = 1
                default_words[word] = default_cluster_name

        result = calculator.calculate(lab, rec)
        if verbose:
            if result['all'] != 0:
                wer = float(result['ins'] + result['sub'] +
                            result['del']) * 100.0 / result['all']
            else:
                wer = 0.0
            print('WER: %4.2f %%' % wer, end=' ')
            print('N=%d C=%d S=%d D=%d I=%d' %
                  (result['all'], result['cor'], result['sub'], result['del'],
                   result['ins']))
            # Column-align the two sequences using display widths.
            space = {}
            space['lab'] = []
            space['rec'] = []
            for idx in range(len(result['lab'])):
                len_lab = width(result['lab'][idx])
                len_rec = width(result['rec'][idx])
                length = max(len_lab, len_rec)
                space['lab'].append(length - len_lab)
                space['rec'].append(length - len_rec)
            upper_lab = len(result['lab'])
            upper_rec = len(result['rec'])
            lab1, rec1 = 0, 0
            while lab1 < upper_lab or rec1 < upper_rec:
                if verbose > 1:
                    print('lab(%s):' % fid.encode('utf-8'), end=' ')
                else:
                    print('lab:', end=' ')
                lab2 = min(upper_lab, lab1 + max_words_per_line)
                for idx in range(lab1, lab2):
                    token = result['lab'][idx]
                    print('{token}'.format(token=token), end='')
                    for n in range(space['lab'][idx]):
                        print(padding_symbol, end='')
                    print(' ', end='')
                print()
                if verbose > 1:
                    print('rec(%s):' % fid.encode('utf-8'), end=' ')
                else:
                    print('rec:', end=' ')
                rec2 = min(upper_rec, rec1 + max_words_per_line)
                for idx in range(rec1, rec2):
                    token = result['rec'][idx]
                    print('{token}'.format(token=token), end='')
                    for n in range(space['rec'][idx]):
                        print(padding_symbol, end='')
                    print(' ', end='')
                print('\n', end='\n')
                lab1 = lab2
                rec1 = rec2

    if verbose:
        print('==========================================================================='
              )
        print()

    result = calculator.overall()
    if result['all'] != 0:
        wer = float(result['ins'] + result['sub'] +
                    result['del']) * 100.0 / result['all']
    else:
        wer = 0.0
    print('Overall -> %4.2f %%' % wer, end=' ')
    print('N=%d C=%d S=%d D=%d I=%d' %
          (result['all'], result['cor'], result['sub'], result['del'],
           result['ins']))
    if not verbose:
        print()

    if verbose:
        # Per default-cluster (script) WERs.
        for cluster_id in default_clusters:
            result = calculator.cluster(
                [k for k in default_clusters[cluster_id]])
            if result['all'] != 0:
                wer = float(result['ins'] + result['sub'] +
                            result['del']) * 100.0 / result['all']
            else:
                wer = 0.0
            print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
            print('N=%d C=%d S=%d D=%d I=%d' %
                  (result['all'], result['cor'], result['sub'], result['del'],
                   result['ins']))
        if len(cluster_file) > 0:  # compute separated WERs for word clusters
            cluster_id = ''
            cluster = []
            for line in open(cluster_file, 'r', encoding='utf-8'):
                # BUG FIX: the file object already yields decoded str; the
                # original called line.decode('utf-8'), which raises
                # AttributeError on Python 3 (str has no .decode()).
                for token in line.rstrip('\n').split():
                    # end of cluster reached, like </Keyword>
                    if token[0:2] == '</' and token[len(token) - 1] == '>' and \
                       token.lstrip('</').rstrip('>') == cluster_id:
                        result = calculator.cluster(cluster)
                        if result['all'] != 0:
                            wer = float(result['ins'] + result['sub'] +
                                        result['del']) * 100.0 / result['all']
                        else:
                            wer = 0.0
                        print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
                        print('N=%d C=%d S=%d D=%d I=%d' %
                              (result['all'], result['cor'], result['sub'],
                               result['del'], result['ins']))
                        cluster_id = ''
                        cluster = []
                    # begin of cluster reached, like <Keyword>
                    elif token[0] == '<' and token[len(token) - 1] == '>' and \
                         cluster_id == '':
                        cluster_id = token.lstrip('<').rstrip('>')
                        cluster = []
                    # general terms, like WEATHER / CAR / ...
                    else:
                        cluster.append(token)
        print()
        print('==========================================================================='
              )
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录