Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
7db13ca9
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7db13ca9
编写于
6月 13, 2017
作者:
Y
Yibing Liu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
enable lm in multiprocessing decoder & add script for params tuning
上级
bb34e903
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
257 addition
and
9 deletion
+257
-9
decoder.py
decoder.py
+18
-5
infer.py
infer.py
+5
-4
tune.py
tune.py
+234
-0
未找到文件。
decoder.py
浏览文件 @
7db13ca9
...
@@ -73,7 +73,7 @@ class Scorer(object):
...
@@ -73,7 +73,7 @@ class Scorer(object):
return
len
(
words
)
return
len
(
words
)
# execute evaluation
# execute evaluation
def
evaluate
(
self
,
sentence
):
def
__call__
(
self
,
sentence
):
lm
=
self
.
language_model_score
(
sentence
)
lm
=
self
.
language_model_score
(
sentence
)
word_cnt
=
self
.
word_count
(
sentence
)
word_cnt
=
self
.
word_count
(
sentence
)
score
=
np
.
power
(
lm
,
self
.
_alpha
)
\
score
=
np
.
power
(
lm
,
self
.
_alpha
)
\
...
@@ -84,8 +84,9 @@ class Scorer(object):
...
@@ -84,8 +84,9 @@ class Scorer(object):
def
ctc_beam_search_decoder
(
probs_seq
,
def
ctc_beam_search_decoder
(
probs_seq
,
beam_size
,
beam_size
,
vocabulary
,
vocabulary
,
blank_id
=
0
,
ext_scoring_func
=
None
,
ext_scoring_func
=
None
,
blank_id
=
0
):
nproc
=
False
):
'''
'''
Beam search decoder for CTC-trained network, using beam search with width
Beam search decoder for CTC-trained network, using beam search with width
beam_size to find many paths to one label, return beam_size labels in
beam_size to find many paths to one label, return beam_size labels in
...
@@ -107,6 +108,8 @@ def ctc_beam_search_decoder(probs_seq,
...
@@ -107,6 +108,8 @@ def ctc_beam_search_decoder(probs_seq,
:type external_scoring_function: function
:type external_scoring_function: function
:param blank_id: id of blank, default 0.
:param blank_id: id of blank, default 0.
:type blank_id: int
:type blank_id: int
:param nproc: Whether the decoder used in multiprocesses.
:type nproc: bool
:return: Decoding log probability and result string.
:return: Decoding log probability and result string.
:rtype: list
:rtype: list
...
@@ -122,6 +125,12 @@ def ctc_beam_search_decoder(probs_seq,
...
@@ -122,6 +125,12 @@ def ctc_beam_search_decoder(probs_seq,
if
not
blank_id
<
probs_dim
:
if
not
blank_id
<
probs_dim
:
raise
ValueError
(
"blank_id shouldn't be greater than probs dimension"
)
raise
ValueError
(
"blank_id shouldn't be greater than probs dimension"
)
# If the decoder is called from multiple processes, use the global scorer
# instantiated in ctc_beam_search_decoder_nproc().
if
nproc
is
True
:
global
ext_nproc_scorer
ext_scoring_func
=
ext_nproc_scorer
## initialize
## initialize
# the set containing selected prefixes
# the set containing selected prefixes
prefix_set_prev
=
{
'
\t
'
:
1.0
}
prefix_set_prev
=
{
'
\t
'
:
1.0
}
...
@@ -193,8 +202,8 @@ def ctc_beam_search_decoder(probs_seq,
...
@@ -193,8 +202,8 @@ def ctc_beam_search_decoder(probs_seq,
def
ctc_beam_search_decoder_nproc
(
probs_split
,
def
ctc_beam_search_decoder_nproc
(
probs_split
,
beam_size
,
beam_size
,
vocabulary
,
vocabulary
,
ext_scoring_func
=
None
,
blank_id
=
0
,
blank_id
=
0
,
ext_scoring_func
=
None
,
num_processes
=
None
):
num_processes
=
None
):
'''
'''
Beam search decoder using multiple processes.
Beam search decoder using multiple processes.
...
@@ -202,7 +211,6 @@ def ctc_beam_search_decoder_nproc(probs_split,
...
@@ -202,7 +211,6 @@ def ctc_beam_search_decoder_nproc(probs_split,
:param probs_seq: 3-D list with length batch_size, each element
:param probs_seq: 3-D list with length batch_size, each element
is a 2-D list of probabilities can be used by
is a 2-D list of probabilities can be used by
ctc_beam_search_decoder.
ctc_beam_search_decoder.
:type probs_seq: 3-D list
:type probs_seq: 3-D list
:param beam_size: Width for beam search.
:param beam_size: Width for beam search.
:type beam_size: int
:type beam_size: int
...
@@ -227,10 +235,15 @@ def ctc_beam_search_decoder_nproc(probs_split,
...
@@ -227,10 +235,15 @@ def ctc_beam_search_decoder_nproc(probs_split,
if
not
num_processes
>
0
:
if
not
num_processes
>
0
:
raise
ValueError
(
"Number of processes must be positive!"
)
raise
ValueError
(
"Number of processes must be positive!"
)
# Use a global variable to pass the external scorer to the beam search decoder.
global
ext_nproc_scorer
ext_nproc_scorer
=
ext_scoring_func
nproc
=
True
pool
=
multiprocessing
.
Pool
(
processes
=
num_processes
)
pool
=
multiprocessing
.
Pool
(
processes
=
num_processes
)
results
=
[]
results
=
[]
for
i
,
probs_list
in
enumerate
(
probs_split
):
for
i
,
probs_list
in
enumerate
(
probs_split
):
args
=
(
probs_list
,
beam_size
,
vocabulary
,
ext_scoring_func
,
blank_id
)
args
=
(
probs_list
,
beam_size
,
vocabulary
,
blank_id
,
None
,
nproc
)
results
.
append
(
pool
.
apply_async
(
ctc_beam_search_decoder
,
args
))
results
.
append
(
pool
.
apply_async
(
ctc_beam_search_decoder
,
args
))
pool
.
close
()
pool
.
close
()
...
...
infer.py
浏览文件 @
7db13ca9
...
@@ -9,6 +9,7 @@ import gzip
...
@@ -9,6 +9,7 @@ import gzip
from
audio_data_utils
import
DataGenerator
from
audio_data_utils
import
DataGenerator
from
model
import
deep_speech2
from
model
import
deep_speech2
from
decoder
import
*
from
decoder
import
*
import
kenlm
from
error_rate
import
wer
from
error_rate
import
wer
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
...
@@ -176,7 +177,7 @@ def infer():
...
@@ -176,7 +177,7 @@ def infer():
probs_seq
=
probs
,
probs_seq
=
probs
,
vocabulary
=
vocab_list
,
vocabulary
=
vocab_list
,
beam_size
=
args
.
beam_size
,
beam_size
=
args
.
beam_size
,
ext_scoring_func
=
ext_scorer
.
evaluate
,
ext_scoring_func
=
ext_scorer
,
blank_id
=
len
(
vocab_list
))
blank_id
=
len
(
vocab_list
))
print
(
"
\n
Target Transcription:
\t
%s"
%
target_transcription
)
print
(
"
\n
Target Transcription:
\t
%s"
%
target_transcription
)
...
@@ -196,9 +197,9 @@ def infer():
...
@@ -196,9 +197,9 @@ def infer():
probs_split
=
probs_split
,
probs_split
=
probs_split
,
vocabulary
=
vocab_list
,
vocabulary
=
vocab_list
,
beam_size
=
args
.
beam_size
,
beam_size
=
args
.
beam_size
,
#ext_scoring_func=ext_scorer.evaluate
,
ext_scoring_func
=
ext_scorer
,
ext_scoring_func
=
None
,
blank_id
=
len
(
vocab_list
)
,
blank_id
=
len
(
vocab_list
)
)
num_processes
=
1
)
for
i
,
beam_search_result
in
enumerate
(
beam_search_nproc_results
):
for
i
,
beam_search_result
in
enumerate
(
beam_search_nproc_results
):
target_transcription
=
''
.
join
(
target_transcription
=
''
.
join
(
[
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]])
[
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]])
...
...
tune.py
0 → 100644
浏览文件 @
7db13ca9
"""
Tune parameters for beam search decoder in Deep Speech 2.
"""
import
paddle.v2
as
paddle
import
distutils.util
import
argparse
import
gzip
from
audio_data_utils
import
DataGenerator
from
model
import
deep_speech2
from
decoder
import
*
from
error_rate
import
wer
# Command-line options for the tuning script, one row per flag:
# (flag, default value, converter applied to the raw string, help text).
# Rows are registered in this order, which is also the order shown by --help.
_TUNE_OPTIONS = [
    ("--num_samples", 100, int,
     "Number of samples for parameters tuning. (default: %(default)s)"),
    ("--num_conv_layers", 2, int,
     "Convolution layer number. (default: %(default)s)"),
    ("--num_rnn_layers", 3, int,
     "RNN layer number. (default: %(default)s)"),
    ("--rnn_layer_size", 512, int,
     "RNN layer cell number. (default: %(default)s)"),
    ("--use_gpu", True, distutils.util.strtobool,
     "Use gpu or not. (default: %(default)s)"),
    ("--normalizer_manifest_path", 'data/manifest.libri.train-clean-100', str,
     "Manifest path for normalizer. (default: %(default)s)"),
    ("--decode_manifest_path", 'data/manifest.libri.test-100sample', str,
     "Manifest path for decoding. (default: %(default)s)"),
    ("--model_filepath", './params.tar.gz', str,
     "Model filepath. (default: %(default)s)"),
    ("--vocab_filepath", 'data/eng_vocab.txt', str,
     "Vocabulary filepath. (default: %(default)s)"),
    ("--decode_method", 'beam_search_nproc', str,
     "Method for decoding, beam_search or beam_search_nproc. (default: %(default)s)"),
    ("--beam_size", 500, int,
     "Width for beam search decoding. (default: %(default)d)"),
    ("--num_results_per_sample", 1, int,
     "Number of outputs per sample in beam search. (default: %(default)d)"),
    ("--language_model_path", "./data/1Billion.klm", str,
     "Path for language model. (default: %(default)s)"),
    ("--alpha_from", 0.0, float,
     "Where alpha starts from, <= alpha_to. (default: %(default)f)"),
    ("--alpha_stride", 0.001, float,
     "Step length for varying alpha. (default: %(default)f)"),
    ("--alpha_to", 0.01, float,
     "Where alpha ends with, >= alpha_from. (default: %(default)f)"),
    ("--beta_from", 0.0, float,
     "Where beta starts from, <= beta_to. (default: %(default)f)"),
    ("--beta_stride", 0.01, float,
     "Step length for varying beta. (default: %(default)f)"),
    ("--beta_to", 0.0, float,
     "Where beta ends with, >= beta_from. (default: %(default)f)"),
]

parser = argparse.ArgumentParser(
    description='Parameters tuning script for ctc beam search decoder in Deep Speech 2.'
)
for _flag, _default, _converter, _help in _TUNE_OPTIONS:
    parser.add_argument(
        _flag, default=_default, type=_converter, help=_help)

# Parsed once at import time; the rest of the script reads this module-level
# namespace.
args = parser.parse_args()
def tune():
    """
    Tune parameters alpha and beta on one minibatch.

    Runs the acoustic model once on the first batch (of size
    ``args.num_samples``) read from ``args.decode_manifest_path``, then for
    every (alpha, beta) pair in the requested grid builds a ``Scorer``,
    decodes the batch with the method selected by ``args.decode_method``
    and prints the average WER over the batch.

    :raises ValueError: If the alpha/beta ranges are inverted, a stride is
                        not positive, or the decoding method is unknown.
    """
    # Validate all options up front so a bad value fails before the model
    # is loaded and inference is run.
    if not args.alpha_from <= args.alpha_to:
        raise ValueError("alpha_from <= alpha_to doesn't satisfy!")
    if not args.alpha_stride > 0:
        raise ValueError("alpha_stride must be positive!")
    if not args.beta_from <= args.beta_to:
        raise ValueError("beta_from <= beta_to doesn't satisfy!")
    if not args.beta_stride > 0:
        raise ValueError("beta_stride must be positive!")
    if args.decode_method not in ("beam_search", "beam_search_nproc"):
        raise ValueError("Decoding method [%s] is not supported." %
                         args.decode_method)

    # initialize data generator
    data_generator = DataGenerator(
        vocab_filepath=args.vocab_filepath,
        normalizer_manifest_path=args.normalizer_manifest_path,
        normalizer_num_samples=200,
        max_duration=20.0,
        min_duration=0.0,
        stride_ms=10,
        window_ms=20)

    # create network config
    dict_size = data_generator.vocabulary_size()
    vocab_list = data_generator.vocabulary_list()
    audio_data = paddle.layer.data(
        name="audio_spectrogram",
        height=161,
        width=2000,
        type=paddle.data_type.dense_vector(322000))
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(dict_size))
    output_probs = deep_speech2(
        audio_data=audio_data,
        text_data=text_data,
        dict_size=dict_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_size=args.rnn_layer_size,
        is_inference=True)

    # load parameters
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(args.model_filepath))

    # prepare infer data: only the first batch of the reader is used
    feeding = data_generator.data_name_feeding()
    test_batch_reader = data_generator.batch_reader_creator(
        manifest_path=args.decode_manifest_path,
        batch_size=args.num_samples,
        padding_to=2000,
        flatten=True,
        sort_by_duration=False,
        shuffle=False)
    infer_data = next(test_batch_reader())

    # run inference once; the outputs are reused for every grid point
    infer_results = paddle.infer(
        output_layer=output_probs, parameters=parameters, input=infer_data)
    # Split the concatenated outputs into one equally sized chunk per sample.
    # Floor division keeps the slice bounds integral.
    num_steps = len(infer_results) // len(infer_data)
    probs_split = [
        infer_results[i * num_steps:(i + 1) * num_steps]
        for i in range(len(infer_data))
    ]

    # The reference transcriptions do not depend on alpha/beta, so build
    # them once instead of once per grid point.
    target_transcriptions = [
        ''.join([vocab_list[index] for index in sample[1]])
        for sample in infer_data
    ]

    # The stop value is pushed one stride past *_to, which is intended to
    # make the upper bound part of the grid.
    cand_alpha = np.arange(args.alpha_from, args.alpha_to + args.alpha_stride,
                           args.alpha_stride)
    cand_beta = np.arange(args.beta_from, args.beta_to + args.beta_stride,
                          args.beta_stride)
    params_grid = [(alpha, beta) for alpha in cand_alpha
                   for beta in cand_beta]

    ## tune parameters in loop
    for (alpha, beta) in params_grid:
        ext_scorer = Scorer(alpha, beta, args.language_model_path)
        if args.decode_method == "beam_search":
            # beam search decode, one sample at a time
            beam_search_results = [
                ctc_beam_search_decoder(
                    probs_seq=probs,
                    vocabulary=vocab_list,
                    beam_size=args.beam_size,
                    ext_scoring_func=ext_scorer,
                    blank_id=len(vocab_list)) for probs in probs_split
            ]
        else:
            # beam search using multiple processes (method validated above)
            beam_search_results = ctc_beam_search_decoder_nproc(
                probs_split=probs_split,
                vocabulary=vocab_list,
                beam_size=args.beam_size,
                ext_scoring_func=ext_scorer,
                blank_id=len(vocab_list),
                num_processes=1)

        # Score the first entry of each sample's result list against its
        # reference transcription.
        wer_sum, wer_counter = 0, 0
        for target_transcription, beam_search_result in zip(
                target_transcriptions, beam_search_results):
            wer_sum += wer(target_transcription, beam_search_result[0][1])
            wer_counter += 1

        print("alpha = %f\tbeta = %f\tWER = %f" %
              (alpha, beta, wer_sum / wer_counter))
def main():
    """
    Script entry point: initialize paddle from the parsed command-line
    options (device choice from --use_gpu, one trainer), then run tune().
    """
    init_options = dict(use_gpu=args.use_gpu, trainer_count=1)
    paddle.init(**init_options)
    tune()


# Only run the tuning when executed as a script, not when imported.
if __name__ == '__main__':
    main()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录