Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
8122dd9c
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8122dd9c
编写于
8月 01, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Simplify train.py, evaluate.py, infer.py and tune.py by adding DeepSpeech2Model class.
上级
92eacf54
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
415 addition
and
441 deletion
+415
-441
evaluate.py
evaluate.py
+27
-80
infer.py
infer.py
+24
-82
layer.py
layer.py
+155
-0
model.py
model.py
+137
-128
train.py
train.py
+35
-86
tune.py
tune.py
+37
-65
未找到文件。
evaluate.py
浏览文件 @
8122dd9c
...
...
@@ -4,14 +4,11 @@ from __future__ import division
from
__future__
import
print_function
import
distutils.util
import
sys
import
argparse
import
gzip
import
multiprocessing
import
paddle.v2
as
paddle
from
data_utils.data
import
DataGenerator
from
model
import
deep_speech2
from
decoder
import
*
from
lm.lm_scorer
import
LmScorer
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
import
utils
...
...
@@ -119,37 +116,12 @@ args = parser.parse_args()
def
evaluate
():
"""Evaluate on whole test data for DeepSpeech2."""
# initialize data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be induced during training.
audio_data
=
paddle
.
layer
.
data
(
name
=
"audio_spectrogram"
,
type
=
paddle
.
data_type
.
dense_array
(
161
*
161
))
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
data_generator
.
vocab_size
))
output_probs
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
True
)
# load parameters
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
gzip
.
open
(
args
.
model_filepath
))
# prepare infer data
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
decode_manifest_path
,
batch_size
=
args
.
batch_size
,
...
...
@@ -157,59 +129,34 @@ def evaluate():
sortagrad
=
False
,
shuffle_method
=
None
)
# define inferer
inferer
=
paddle
.
inference
.
Inference
(
output_layer
=
output_probs
,
parameters
=
parameters
)
# initialize external scorer for beam search decoding
if
args
.
decode_method
==
'beam_search'
:
ext_scorer
=
LmScorer
(
args
.
alpha
,
args
.
beta
,
args
.
language_model_path
)
ds2_model
=
DeepSpeech2Model
(
vocab_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
pretrained_model_path
=
args
.
model_filepath
)
wer_
counter
,
wer_sum
=
0
,
0.
0
wer_
sum
,
num_ins
=
0.0
,
0
for
infer_data
in
batch_reader
():
# run inference
infer_results
=
inferer
.
infer
(
input
=
infer_data
)
num_steps
=
len
(
infer_results
)
//
len
(
infer_data
)
probs_split
=
[
infer_results
[
i
*
num_steps
:(
i
+
1
)
*
num_steps
]
for
i
in
xrange
(
0
,
len
(
infer_data
))
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
infer_data
,
decode_method
=
args
.
decode_method
,
beam_alpha
=
args
.
alpha
,
beam_beta
=
args
.
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
language_model_path
,
num_processes
=
args
.
num_processes_beam_search
)
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
]
# target transcription
target_transcription
=
[
''
.
join
([
data_generator
.
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]
])
for
i
,
probs
in
enumerate
(
probs_split
)
]
# decode and print
# best path decode
if
args
.
decode_method
==
"best_path"
:
for
i
,
probs
in
enumerate
(
probs_split
):
output_transcription
=
ctc_best_path_decoder
(
probs_seq
=
probs
,
vocabulary
=
data_generator
.
vocab_list
)
wer_sum
+=
wer
(
target_transcription
[
i
],
output_transcription
)
wer_counter
+=
1
# beam search decode
elif
args
.
decode_method
==
"beam_search"
:
# beam search using multiple processes
beam_search_results
=
ctc_beam_search_decoder_batch
(
probs_split
=
probs_split
,
vocabulary
=
data_generator
.
vocab_list
,
beam_size
=
args
.
beam_size
,
blank_id
=
len
(
data_generator
.
vocab_list
),
num_processes
=
args
.
num_processes_beam_search
,
ext_scoring_func
=
ext_scorer
,
cutoff_prob
=
args
.
cutoff_prob
)
for
i
,
beam_search_result
in
enumerate
(
beam_search_results
):
wer_sum
+=
wer
(
target_transcription
[
i
],
beam_search_result
[
0
][
1
])
wer_counter
+=
1
else
:
raise
ValueError
(
"Decoding method [%s] is not supported."
%
decode_method
)
print
(
"WER (%d/?) = %f"
%
(
wer_counter
,
wer_sum
/
wer_counter
))
print
(
"Final WER (%d/%d) = %f"
%
(
wer_counter
,
wer_counter
,
wer_sum
/
wer_counter
))
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
wer_sum
+=
wer
(
target
,
result
)
num_ins
+=
1
print
(
"WER (%d/?) = %f"
%
(
num_ins
,
wer_sum
/
num_ins
))
print
(
"Final WER (%d/%d) = %f"
%
(
num_ins
,
num_ins
,
wer_sum
/
num_ins
))
def
main
():
...
...
infer.py
浏览文件 @
8122dd9c
...
...
@@ -4,14 +4,11 @@ from __future__ import division
from
__future__
import
print_function
import
argparse
import
gzip
import
distutils.util
import
multiprocessing
import
paddle.v2
as
paddle
from
data_utils.data
import
DataGenerator
from
model
import
deep_speech2
from
decoder
import
*
from
lm.lm_scorer
import
LmScorer
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
import
utils
...
...
@@ -124,37 +121,12 @@ args = parser.parse_args()
def
infer
():
"""Inference for DeepSpeech2."""
# initialize data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be induced during training.
audio_data
=
paddle
.
layer
.
data
(
name
=
"audio_spectrogram"
,
type
=
paddle
.
data_type
.
dense_array
(
161
*
161
))
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
data_generator
.
vocab_size
))
output_probs
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
True
)
# load parameters
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
gzip
.
open
(
args
.
model_filepath
))
# prepare infer data
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
decode_manifest_path
,
batch_size
=
args
.
num_samples
,
...
...
@@ -163,61 +135,31 @@ def infer():
shuffle_method
=
None
)
infer_data
=
batch_reader
().
next
()
# run inference
infer_results
=
paddle
.
infer
(
output_layer
=
output_probs
,
parameters
=
parameters
,
input
=
infer_data
)
num_steps
=
len
(
infer_results
)
//
len
(
infer_data
)
probs_split
=
[
infer_results
[
i
*
num_steps
:(
i
+
1
)
*
num_steps
]
for
i
in
xrange
(
len
(
infer_data
))
]
ds2_model
=
DeepSpeech2Model
(
vocab_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
pretrained_model_path
=
args
.
model_filepath
)
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
infer_data
,
decode_method
=
args
.
decode_method
,
beam_alpha
=
args
.
alpha
,
beam_beta
=
args
.
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
language_model_path
,
num_processes
=
args
.
num_processes_beam_search
)
# target transcription
target_transcription
=
[
''
.
join
(
[
data_generator
.
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]])
for
i
,
probs
in
enumerate
(
probs_split
)
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
]
## decode and print
# best path decode
wer_sum
,
wer_counter
=
0
,
0
if
args
.
decode_method
==
"best_path"
:
for
i
,
probs
in
enumerate
(
probs_split
):
best_path_transcription
=
ctc_best_path_decoder
(
probs_seq
=
probs
,
vocabulary
=
data_generator
.
vocab_list
)
print
(
"
\n
Target Transcription: %s
\n
Output Transcription: %s"
%
(
target_transcription
[
i
],
best_path_transcription
))
wer_cur
=
wer
(
target_transcription
[
i
],
best_path_transcription
)
wer_sum
+=
wer_cur
wer_counter
+=
1
print
(
"cur wer = %f, average wer = %f"
%
(
wer_cur
,
wer_sum
/
wer_counter
))
# beam search decode
elif
args
.
decode_method
==
"beam_search"
:
ext_scorer
=
LmScorer
(
args
.
alpha
,
args
.
beta
,
args
.
language_model_path
)
beam_search_batch_results
=
ctc_beam_search_decoder_batch
(
probs_split
=
probs_split
,
vocabulary
=
data_generator
.
vocab_list
,
beam_size
=
args
.
beam_size
,
blank_id
=
len
(
data_generator
.
vocab_list
),
num_processes
=
args
.
num_processes_beam_search
,
cutoff_prob
=
args
.
cutoff_prob
,
ext_scoring_func
=
ext_scorer
,
)
for
i
,
beam_search_result
in
enumerate
(
beam_search_batch_results
):
print
(
"
\n
Target Transcription:
\t
%s"
%
target_transcription
[
i
])
for
index
in
xrange
(
args
.
num_results_per_sample
):
result
=
beam_search_result
[
index
]
#output: index, log prob, beam result
print
(
"Beam %d: %f
\t
%s"
%
(
index
,
result
[
0
],
result
[
1
]))
wer_cur
=
wer
(
target_transcription
[
i
],
beam_search_result
[
0
][
1
])
wer_sum
+=
wer_cur
wer_counter
+=
1
print
(
"Current WER = %f , Average WER = %f"
%
(
wer_cur
,
wer_sum
/
wer_counter
))
else
:
raise
ValueError
(
"Decoding method [%s] is not supported."
%
decode_method
)
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
print
(
"
\n
Target Transcription: %s
\n
Output Transcription: %s"
%
(
target
,
result
))
print
(
"Current wer = %f"
%
wer
(
target
,
result
))
def
main
():
...
...
layer.py
0 → 100644
浏览文件 @
8122dd9c
"""Contains DeepSpeech2 layers."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
paddle.v2
as
paddle
# Temporary switch (the code that reads it marks it as a patch to be removed):
# when True, conv_bn_layer and bidirectional_simple_rnn_bn_layer pass
# batch_norm_type="batch_norm" explicitly to paddle.layer.batch_norm.
DISABLE_CUDNN_BATCH_NORM = True
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """
    Image convolution followed by batch normalization.

    The convolution is linear and bias-free; the activation ``act`` is applied
    by the batch normalization layer that wraps it.
    """
    linear_conv = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    batch_norm_kwargs = {"input": linear_conv, "act": act}
    if DISABLE_CUDNN_BATCH_NORM:
        # Temporary patch, needs to be removed.
        batch_norm_kwargs["batch_norm_type"] = "batch_norm"
    return paddle.layer.batch_norm(**batch_norm_kwargs)
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """
    Bidirectional simple rnn layer with sequence-wise batch normalization.

    The batch normalization is only performed on the input-state projection.
    A single projection is shared by the forward and the backward recurrent
    layers, whose outputs are concatenated. ``name`` is accepted but is not
    used by this function.
    """
    # One input-hidden projection, shared across both directions.
    shared_projection = paddle.layer.fc(
        input=input,
        size=size,
        act=paddle.activation.Linear(),
        bias_attr=False)

    # Batch norm is applied to the input-state projection only.
    batch_norm_kwargs = {
        "input": shared_projection,
        "act": paddle.activation.Linear(),
    }
    if DISABLE_CUDNN_BATCH_NORM:
        # Temporary patch, needs to be removed.
        batch_norm_kwargs["batch_norm_type"] = "batch_norm"
    normalized_projection = paddle.layer.batch_norm(**batch_norm_kwargs)

    # Forward in time first, then backward in time.
    both_directions = [
        paddle.layer.recurrent(
            input=normalized_projection, act=act, reverse=is_reversed)
        for is_reversed in (False, True)
    ]
    return paddle.layer.concat(input=both_directions)
def conv_group(input, num_stacks):
    """
    Convolution group made of ``num_stacks`` stacked conv + batch-norm layers.

    Returns a tuple ``(conv_output, output_num_channels, output_height)``.
    """
    num_filters = 32

    # First layer: one input channel, wider filter and stride 3 on the
    # first axis.
    stacked = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())

    # Remaining layers all share one configuration.
    for _ in xrange(num_stacks - 1):
        stacked = conv_bn_layer(
            input=stacked,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())

    # Every layer uses stride 2 on the second axis, hence 2 ** num_stacks.
    height = 160 // (2 ** num_stacks) + 1
    return stacked, num_filters, height
def rnn_group(input, size, num_stacks):
    """
    RNN group: ``num_stacks`` bidirectional simple RNN layers stacked on
    top of each other, each fed with the previous layer's output.
    """
    current = input
    layer_index = 0
    while layer_index < num_stacks:
        layer_name = str(layer_index)
        activation = paddle.activation.BRelu()
        current = bidirectional_simple_rnn_bn_layer(
            name=layer_name, input=current, size=size, act=activation)
        layer_index += 1
    return current
def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    Both the output-probability layer and the CTC cost layer are always
    built and returned together; there is no separate inference mode.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :return: A tuple (log_probs, ctc_loss): the softmax output layer over
             dict_size + 1 classes, and the warp-CTC cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    # One extra output class is reserved for the CTC blank (index dict_size).
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    # NOTE: despite the name, this layer applies a plain Softmax activation.
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost, computed from the pre-softmax fc output
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
model.py
浏览文件 @
8122dd9c
...
...
@@ -3,141 +3,150 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
os
import
time
import
gzip
from
decoder
import
*
from
lm.lm_scorer
import
LmScorer
import
paddle.v2
as
paddle
from
layer
import
*
def
conv_bn_layer
(
input
,
filter_size
,
num_channels_in
,
num_channels_out
,
stride
,
padding
,
act
):
"""
Convolution layer with batch normalization.
"""
conv_layer
=
paddle
.
layer
.
img_conv
(
input
=
input
,
filter_size
=
filter_size
,
num_channels
=
num_channels_in
,
num_filters
=
num_channels_out
,
stride
=
stride
,
padding
=
padding
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
return
paddle
.
layer
.
batch_norm
(
input
=
conv_layer
,
act
=
act
)
class
DeepSpeech2Model
(
object
):
def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
             rnn_layer_size, pretrained_model_path):
    """
    Build the network, then create or load its parameters.

    :param vocab_size: Vocabulary size, forwarded to the network builder.
    :param num_conv_layers: Number of stacking convolution layers.
    :param num_rnn_layers: Number of stacking RNN layers.
    :param rnn_layer_size: RNN layer size.
    :param pretrained_model_path: Forwarded to _create_parameters as the
                                  model path.
    """
    # Both are created lazily on the first infer_batch call that needs them.
    self._inferer = None
    self._ext_scorer = None
    # The network must exist before parameters can be created for it.
    self._create_network(
        vocab_size, num_conv_layers, num_rnn_layers, rnn_layer_size)
    self._create_parameters(pretrained_model_path)
def
train
(
self
,
train_batch_reader
,
dev_batch_reader
,
feeding_dict
,
learning_rate
,
gradient_clipping
,
num_passes
,
num_iterations_print
=
100
,
output_model_dir
=
'checkpoints'
):
# prepare optimizer and trainer
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
learning_rate
,
gradient_clipping_threshold
=
gradient_clipping
)
trainer
=
paddle
.
trainer
.
SGD
(
cost
=
self
.
_loss
,
parameters
=
self
.
_parameters
,
update_equation
=
optimizer
)
def
bidirectional_simple_rnn_bn_layer
(
name
,
input
,
size
,
act
):
"""
Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
"""
# input-hidden weights shared across bi-direcitonal rnn.
input_proj
=
paddle
.
layer
.
fc
(
input
=
input
,
size
=
size
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
# batch norm is only performed on input-state projection
input_proj_bn
=
paddle
.
layer
.
batch_norm
(
input
=
input_proj
,
act
=
paddle
.
activation
.
Linear
())
# forward and backward in time
forward_simple_rnn
=
paddle
.
layer
.
recurrent
(
input
=
input_proj_bn
,
act
=
act
,
reverse
=
False
)
backward_simple_rnn
=
paddle
.
layer
.
recurrent
(
input
=
input_proj_bn
,
act
=
act
,
reverse
=
True
)
return
paddle
.
layer
.
concat
(
input
=
[
forward_simple_rnn
,
backward_simple_rnn
])
# create event handler
def
event_handler
(
event
):
global
start_time
,
cost_sum
,
cost_counter
if
isinstance
(
event
,
paddle
.
event
.
EndIteration
):
cost_sum
+=
event
.
cost
cost_counter
+=
1
if
(
event
.
batch_id
+
1
)
%
num_iterations_print
==
0
:
output_model_path
=
os
.
path
.
join
(
output_model_dir
,
"params.latest.tar.gz"
)
with
gzip
.
open
(
output_model_path
,
'w'
)
as
f
:
self
.
_parameters
.
to_tar
(
f
)
print
(
"
\n
Pass: %d, Batch: %d, TrainCost: %f"
%
(
event
.
pass_id
,
event
.
batch_id
+
1
,
cost_sum
/
cost_counter
))
cost_sum
,
cost_counter
=
0.0
,
0
else
:
sys
.
stdout
.
write
(
'.'
)
sys
.
stdout
.
flush
()
if
isinstance
(
event
,
paddle
.
event
.
BeginPass
):
start_time
=
time
.
time
()
cost_sum
,
cost_counter
=
0.0
,
0
if
isinstance
(
event
,
paddle
.
event
.
EndPass
):
result
=
trainer
.
test
(
reader
=
dev_batch_reader
,
feeding
=
feeding_dict
)
output_model_path
=
os
.
path
.
join
(
output_model_dir
,
"params.pass-%d.tar.gz"
%
event
.
pass_id
)
with
gzip
.
open
(
output_model_path
,
'w'
)
as
f
:
self
.
_parameters
.
to_tar
(
f
)
print
(
"
\n
------- Time: %d sec, Pass: %d, ValidationCost: %s"
%
(
time
.
time
()
-
start_time
,
event
.
pass_id
,
result
.
cost
))
# run train
trainer
.
train
(
reader
=
train_batch_reader
,
event_handler
=
event_handler
,
num_passes
=
num_passes
,
feeding
=
feeding_dict
)
def
conv_group
(
input
,
num_stacks
):
"""
Convolution group with several stacking convolution layers.
"""
conv
=
conv_bn_layer
(
input
=
input
,
filter_size
=
(
11
,
41
),
num_channels_in
=
1
,
num_channels_out
=
32
,
stride
=
(
3
,
2
),
padding
=
(
5
,
20
),
act
=
paddle
.
activation
.
BRelu
())
for
i
in
xrange
(
num_stacks
-
1
):
conv
=
conv_bn_layer
(
input
=
conv
,
filter_size
=
(
11
,
21
),
num_channels_in
=
32
,
num_channels_out
=
32
,
stride
=
(
1
,
2
),
padding
=
(
5
,
10
),
act
=
paddle
.
activation
.
BRelu
())
output_num_channels
=
32
output_height
=
160
//
pow
(
2
,
num_stacks
)
+
1
return
conv
,
output_num_channels
,
output_height
def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
                beam_size, cutoff_prob, vocab_list, language_model_path,
                num_processes):
    """
    Run inference on one batch and decode the outputs into transcriptions.

    :param infer_data: One batch of samples; passed as-is to the inferer.
                       Its length is taken as the number of samples.
    :param decode_method: Either "best_path" or "beam_search".
    :param beam_alpha: Alpha parameter for the external scorer
                       (beam search only).
    :param beam_beta: Beta parameter for the external scorer
                      (beam search only).
    :param beam_size: Beam size forwarded to the beam search decoder.
    :param cutoff_prob: Cutoff probability forwarded to the beam search
                        decoder.
    :param vocab_list: Vocabulary used by the decoders; len(vocab_list) is
                       used as the blank id for beam search.
    :param language_model_path: Language model path for the external scorer.
                                Must stay the same across calls on one
                                model instance.
    :param num_processes: Number of processes for the batch beam search.
    :return: One decoding result per sample.
    :rtype: list
    :raises ValueError: If decode_method is not supported.
    """
    # define inferer (created once, then reused by later calls)
    if self._inferer is None:
        self._inferer = paddle.inference.Inference(
            output_layer=self._log_probs, parameters=self._parameters)
    # run inference
    infer_results = self._inferer.infer(input=infer_data)
    # The inferer output is sliced into equal-length chunks, one per sample.
    num_steps = len(infer_results) // len(infer_data)
    probs_split = [
        infer_results[i * num_steps:(i + 1) * num_steps]
        for i in xrange(0, len(infer_data))
    ]
    # run decoder
    if decode_method == "best_path":
        # best path decode
        # Uses the vocab_list argument; the previous code referenced an
        # undefined name `data_generator` here and raised NameError.
        results = [
            ctc_best_path_decoder(probs_seq=probs, vocabulary=vocab_list)
            for probs in probs_split
        ]
    elif decode_method == "beam_search":
        # initialize external scorer
        if self._ext_scorer is None:
            self._ext_scorer = LmScorer(beam_alpha, beam_beta,
                                        language_model_path)
            self._loaded_lm_path = language_model_path
        else:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
            # The scorer is built only once, so switching to a different
            # language model on later calls is not supported.
            assert self._loaded_lm_path == language_model_path
        # beam search decode
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            blank_id=len(vocab_list),
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob)
        # Keep field [1] of the first entry of each sample's beam results.
        results = [result[0][1] for result in beam_search_results]
    else:
        raise ValueError("Decoding method [%s] is not supported." %
                         decode_method)
    return results
def
rnn_group
(
input
,
size
,
num_stacks
):
"""
RNN group with several stacking RNN layers.
"""
output
=
input
for
i
in
xrange
(
num_stacks
):
output
=
bidirectional_simple_rnn_bn_layer
(
name
=
str
(
i
),
input
=
output
,
size
=
size
,
act
=
paddle
.
activation
.
BRelu
())
return
output
def _create_parameters(self, model_path=None):
    """
    Create fresh parameters for the network or load them from a file.

    :param model_path: Path of a gzip-compressed parameter tarball. If None,
                       new parameters are created from the loss layer.
    :type model_path: basestring|None
    """
    if model_path is None:
        self._parameters = paddle.parameters.create(self._loss)
    else:
        # Close the gzip handle once loading is done; it was previously
        # left open.
        # NOTE(review): assumes from_tar reads everything before returning
        # -- confirm against the paddle version in use.
        with gzip.open(model_path) as model_file:
            self._parameters = paddle.parameters.Parameters.from_tar(
                model_file)
def
deep_speech2
(
audio_data
,
text_data
,
dict_size
,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
rnn_size
=
256
,
is_inference
=
False
):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:param is_inference: False in the training mode, and True in the
inferene mode.
:type is_inference: bool
:return: If is_inference set False, return a ctc cost layer;
if is_inference set True, return a sequence layer of output
probability distribution.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output
,
conv_group_num_channels
,
conv_group_height
=
conv_group
(
input
=
audio_data
,
num_stacks
=
num_conv_layers
)
# convert data form convolution feature map to sequence of vectors
conv2seq
=
paddle
.
layer
.
block_expand
(
input
=
conv_group_output
,
num_channels
=
conv_group_num_channels
,
stride_x
=
1
,
stride_y
=
1
,
block_x
=
1
,
block_y
=
conv_group_height
)
# rnn group
rnn_group_output
=
rnn_group
(
input
=
conv2seq
,
size
=
rnn_size
,
num_stacks
=
num_rnn_layers
)
fc
=
paddle
.
layer
.
fc
(
input
=
rnn_group_output
,
size
=
dict_size
+
1
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
True
)
if
is_inference
:
# probability distribution with softmax
return
paddle
.
layer
.
mixed
(
input
=
paddle
.
layer
.
identity_projection
(
input
=
fc
),
act
=
paddle
.
activation
.
Softmax
())
else
:
# ctc cost
return
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                    rnn_layer_size):
    """
    Create the data layers and the DeepSpeech2 network on top of them.

    Stores the two layers returned by deep_speech2 as ``self._log_probs``
    and ``self._loss``.
    """
    # paddle.data_type.dense_array is used for variable batch input.
    # The size 161 * 161 is only a placeholder value and the real shape
    # of input batch data will be induced during training.
    spectrogram_type = paddle.data_type.dense_array(161 * 161)
    spectrogram_layer = paddle.layer.data(
        name="audio_spectrogram", type=spectrogram_type)

    transcript_type = paddle.data_type.integer_value_sequence(vocab_size)
    transcript_layer = paddle.layer.data(
        name="transcript_text", type=transcript_type)

    network_outputs = deep_speech2(
        audio_data=spectrogram_layer,
        text_data=transcript_layer,
        dict_size=vocab_size,
        num_conv_layers=num_conv_layers,
        num_rnn_layers=num_rnn_layers,
        rnn_size=rnn_layer_size)
    self._log_probs, self._loss = network_outputs
train.py
浏览文件 @
8122dd9c
...
...
@@ -3,15 +3,11 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
sys
import
os
import
argparse
import
gzip
import
time
import
distutils.util
import
multiprocessing
import
paddle.v2
as
paddle
from
model
import
deep_speech2
from
model
import
DeepSpeech2Model
from
data_utils.data
import
DataGenerator
import
utils
...
...
@@ -23,6 +19,12 @@ parser.add_argument(
default
=
200
,
type
=
int
,
help
=
"Training pass number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_iterations_print"
,
default
=
100
,
type
=
int
,
help
=
"Number of iterations for every train cost printing. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
...
...
@@ -127,100 +129,47 @@ args = parser.parse_args()
def
train
():
"""DeepSpeech2 training."""
# initialize data generator
def
data_generator
():
return
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
args
.
augmentation_config
,
max_duration
=
args
.
max_duration
,
min_duration
=
args
.
min_duration
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
train_generator
=
data_generator
()
test_generator
=
data_generator
()
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be induced during training.
audio_data
=
paddle
.
layer
.
data
(
name
=
"audio_spectrogram"
,
type
=
paddle
.
data_type
.
dense_array
(
161
*
161
))
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
train_generator
.
vocab_size
))
cost
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
train_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
False
)
# create/load parameters and optimizer
if
args
.
init_model_path
is
None
:
parameters
=
paddle
.
parameters
.
create
(
cost
)
else
:
if
not
os
.
path
.
isfile
(
args
.
init_model_path
):
raise
IOError
(
"Invalid model!"
)
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
gzip
.
open
(
args
.
init_model_path
))
optimizer
=
paddle
.
optimizer
.
Adam
(
learning_rate
=
args
.
adam_learning_rate
,
gradient_clipping_threshold
=
400
)
trainer
=
paddle
.
trainer
.
SGD
(
cost
=
cost
,
parameters
=
parameters
,
update_equation
=
optimizer
)
# prepare data reader
train_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
args
.
augmentation_config
,
max_duration
=
args
.
max_duration
,
min_duration
=
args
.
min_duration
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
dev_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
"{}"
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
train_batch_reader
=
train_generator
.
batch_reader_creator
(
manifest_path
=
args
.
train_manifest_path
,
batch_size
=
args
.
batch_size
,
min_batch_size
=
args
.
trainer_count
,
sortagrad
=
args
.
use_sortagrad
if
args
.
init_model_path
is
None
else
False
,
shuffle_method
=
args
.
shuffle_method
)
test_batch_reader
=
test
_generator
.
batch_reader_creator
(
dev_batch_reader
=
dev
_generator
.
batch_reader_creator
(
manifest_path
=
args
.
dev_manifest_path
,
batch_size
=
args
.
batch_size
,
min_batch_size
=
1
,
# must be 1, but will have errors.
sortagrad
=
False
,
shuffle_method
=
None
)
# create event handler
def
event_handler
(
event
):
global
start_time
,
cost_sum
,
cost_counter
if
isinstance
(
event
,
paddle
.
event
.
EndIteration
):
cost_sum
+=
event
.
cost
cost_counter
+=
1
if
(
event
.
batch_id
+
1
)
%
100
==
0
:
print
(
"
\n
Pass: %d, Batch: %d, TrainCost: %f"
%
(
event
.
pass_id
,
event
.
batch_id
+
1
,
cost_sum
/
cost_counter
))
cost_sum
,
cost_counter
=
0.0
,
0
with
gzip
.
open
(
"checkpoints/params.latest.tar.gz"
,
'w'
)
as
f
:
parameters
.
to_tar
(
f
)
else
:
sys
.
stdout
.
write
(
'.'
)
sys
.
stdout
.
flush
()
if
isinstance
(
event
,
paddle
.
event
.
BeginPass
):
start_time
=
time
.
time
()
cost_sum
,
cost_counter
=
0.0
,
0
if
isinstance
(
event
,
paddle
.
event
.
EndPass
):
result
=
trainer
.
test
(
reader
=
test_batch_reader
,
feeding
=
test_generator
.
feeding
)
print
(
"
\n
------- Time: %d sec, Pass: %d, ValidationCost: %s"
%
(
time
.
time
()
-
start_time
,
event
.
pass_id
,
result
.
cost
))
with
gzip
.
open
(
"checkpoints/params.pass-%d.tar.gz"
%
event
.
pass_id
,
'w'
)
as
f
:
parameters
.
to_tar
(
f
)
# run train
trainer
.
train
(
reader
=
train_batch_reader
,
event_handler
=
event_handler
,
ds2_model
=
DeepSpeech2Model
(
vocab_size
=
train_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
pretrained_model_path
=
args
.
init_model_path
)
ds2_model
.
train
(
train_batch_reader
=
train_batch_reader
,
dev_batch_reader
=
dev_batch_reader
,
feeding_dict
=
train_generator
.
feeding
,
learning_rate
=
args
.
adam_learning_rate
,
gradient_clipping
=
400
,
num_passes
=
args
.
num_passes
,
feeding
=
train_generator
.
feeding
)
num_iterations_print
=
args
.
num_iterations_print
)
def
main
():
...
...
tune.py
浏览文件 @
8122dd9c
...
...
@@ -3,14 +3,13 @@ from __future__ import absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
numpy
as
np
import
distutils.util
import
argparse
import
gzip
import
multiprocessing
import
paddle.v2
as
paddle
from
data_utils.data
import
DataGenerator
from
model
import
deep_speech2
from
decoder
import
*
from
lm.lm_scorer
import
LmScorer
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
import
utils
...
...
@@ -40,6 +39,11 @@ parser.add_argument(
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
multiprocessing
.
cpu_count
(),
...
...
@@ -62,10 +66,10 @@ parser.add_argument(
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--
decod
e_manifest_path"
,
"--
tun
e_manifest_path"
,
default
=
'datasets/manifest.test'
,
type
=
str
,
help
=
"Manifest path for
decod
ing. (default: %(default)s)"
)
help
=
"Manifest path for
tun
ing. (default: %(default)s)"
)
parser
.
add_argument
(
"--model_filepath"
,
default
=
'checkpoints/params.latest.tar.gz'
,
...
...
@@ -127,96 +131,64 @@ args = parser.parse_args()
def
tune
():
"""Tune parameters alpha and beta on one minibatch."""
if
not
args
.
num_alphas
>=
0
:
raise
ValueError
(
"num_alphas must be non-negative!"
)
if
not
args
.
num_betas
>=
0
:
raise
ValueError
(
"num_betas must be non-negative!"
)
# initialize data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be inferred during training.
audio_data
=
paddle
.
layer
.
data
(
name
=
"audio_spectrogram"
,
type
=
paddle
.
data_type
.
dense_array
(
161
*
161
))
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
data_generator
.
vocab_size
))
output_probs
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
True
)
# load parameters
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
gzip
.
open
(
args
.
model_filepath
))
# prepare infer data
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
decod
e_manifest_path
,
manifest_path
=
args
.
tun
e_manifest_path
,
batch_size
=
args
.
num_samples
,
sortagrad
=
False
,
shuffle_method
=
None
)
# get one batch data for tuning
infer_data
=
batch_reader
().
next
()
# run inference
infer_results
=
paddle
.
infer
(
output_layer
=
output_probs
,
parameters
=
parameters
,
input
=
infer_data
)
num_steps
=
len
(
infer_results
)
//
len
(
infer_data
)
probs_split
=
[
infer_results
[
i
*
num_steps
:(
i
+
1
)
*
num_steps
]
for
i
in
xrange
(
0
,
len
(
infer_data
))
tune_data
=
batch_reader
().
next
()
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
tune_data
]
ds2_model
=
DeepSpeech2Model
(
vocab_size
=
data_generator
.
vocab_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
pretrained_model_path
=
args
.
model_filepath
)
# create grid for search
cand_alphas
=
np
.
linspace
(
args
.
alpha_from
,
args
.
alpha_to
,
args
.
num_alphas
)
cand_betas
=
np
.
linspace
(
args
.
beta_from
,
args
.
beta_to
,
args
.
num_betas
)
params_grid
=
[(
alpha
,
beta
)
for
alpha
in
cand_alphas
for
beta
in
cand_betas
]
ext_scorer
=
LmScorer
(
args
.
alpha_from
,
args
.
beta_from
,
args
.
language_model_path
)
## tune parameters in loop
for
alpha
,
beta
in
params_grid
:
wer_sum
,
wer_counter
=
0
,
0
# reset scorer
ext_scorer
.
reset_params
(
alpha
,
beta
)
# beam search using multiple processes
beam_search_results
=
ctc_beam_search_decoder_batch
(
probs_split
=
probs_split
,
vocabulary
=
data_generator
.
vocab_list
,
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
tune_data
,
decode_method
=
'beam_search'
,
beam_alpha
=
alpha
,
beam_beta
=
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
blank_id
=
len
(
data_generator
.
vocab_list
),
num_processes
=
args
.
num_processes_beam_search
,
ext_scoring_func
=
ext_scorer
,
)
for
i
,
beam_search_result
in
enumerate
(
beam_search_results
):
target_transcription
=
''
.
join
([
data_generator
.
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]
])
wer_sum
+=
wer
(
target_transcription
,
beam_search_result
[
0
][
1
])
wer_counter
+=
1
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
language_model_path
,
num_processes
=
args
.
num_processes_beam_search
)
wer_sum
,
num_ins
=
0.0
,
0
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
wer_sum
+=
wer
(
target
,
result
)
num_ins
+=
1
print
(
"alpha = %f
\t
beta = %f
\t
WER = %f"
%
(
alpha
,
beta
,
wer_sum
/
wer_counter
))
(
alpha
,
beta
,
wer_sum
/
num_ins
))
def
main
():
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
1
)
utils
.
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
args
.
trainer_count
)
tune
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录