Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
0babc5c4
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
0babc5c4
编写于
5月 25, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add function docs.
上级
70a343a4
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
153 addition
and
19 deletion
+153
-19
audio_data_utils.py
audio_data_utils.py
+54
-2
infer.py
infer.py
+22
-7
librispeech.py
librispeech.py
+12
-3
model.py
model.py
+49
-0
train.py
train.py
+16
-7
未找到文件。
audio_data_utils.py
浏览文件 @
0babc5c4
"""
Audio data preprocessing tools and reader creators.
"""
import
paddle.v2
as
paddle
import
logging
import
json
...
...
@@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path):
def get_vocabulary_size():
    """
    Return the number of entries in the vocabulary dictionary.

    The vocabulary is loaded from ENGLISH_CHAR_VOCAB_FILEPATH through
    vocabulary_from_file, which yields a pair; only the first element
    (the dictionary) is measured.

    :return: Length of the vocabulary dictionary.
    :rtype: int
    """
    token_dict, _unused = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
    size = len(token_dict)
    return size
def get_vocabulary():
    """
    Load the vocabulary stored at ENGLISH_CHAR_VOCAB_FILEPATH.

    :return: Whatever vocabulary_from_file produces for that path,
             passed through unchanged.
    """
    vocabulary = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
    return vocabulary
def parse_transcript(text, vocabulary):
    """
    Convert the transcript text string to list of token index integers.

    :param text: Transcript to convert. It is iterated item by item, so a
                 string is processed one character at a time.
    :param vocabulary: Mapping indexed with each item of text.
    :return: The looked-up values, in the same order as the items of text.
    :rtype: list
    :raises KeyError: If vocabulary is a dict and an item of text is not
                      one of its keys.
    """
    return [vocabulary[w] for w in text]
...
...
@@ -106,6 +115,28 @@ def reader_creator(manifest_path,
shuffle
=
False
,
max_duration
=
10.0
,
min_duration
=
0.0
):
"""
Audio data reader creator.
Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
tokenized transcription text.
:param manifest_path: Filepath for Manifest of audio clip files.
:type manifest_path: basestring
:param sort_by_duration: Sort the audio clips by duration if set True.
For SortaGrad.
:type sort_by_duration: bool
:param shuffle: Shuffle the audio clips if set True.
:type shuffle: bool
:param max_duration: Audio clips with duration (in seconds) greater than
this will be discarded.
:type max_duration: float
:param min_duration: Audio clips with duration (in seconds) smaller than
this will be discarded.
:type min_duration: float
:return: Data reader function.
:rtype: callable
"""
if
sort_by_duration
and
shuffle
:
sort_by_duration
=
False
logger
.
warn
(
"When shuffle set to true, "
...
...
@@ -138,6 +169,27 @@ def reader_creator(manifest_path,
def
padding_batch_reader
(
batch_reader
,
padding
=
[
-
1
,
-
1
],
flatten
=
True
):
"""
Padding for batches. Return a batch reader.
Each instance in a batch will be padded to be of a same target shape.
The target shape is the largest shape among all the batch instances and
'padding' argument. Therefore, if padding is set [-1, -1], instance will be
padded to have the same shape just within each batch and the shape will
be different across batches; if padding is set
[VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
:param batch_reader: Input batch reader.
:type batch_reader: callable
:param padding: Padding pattern. Details please refer to the above.
:type padding: list
:param flatten: Flatten the tensor to be one dimension.
:type flatten: bool
:return: Batch reader function.
:rtype: callable
"""
def
padding_batch
(
batch
):
new_batch
=
[]
# get target shape within batch
...
...
infer.py
浏览文件 @
0babc5c4
"""
Inference for a simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
import
audio_data_utils
from
itertools
import
groupby
import
argparse
from
model
import
deep_speech2
import
gzip
from
itertools
import
groupby
import
audio_data_utils
from
model
import
deep_speech2
parser
=
argparse
.
ArgumentParser
(
description
=
'Simpled version of DeepSpeech2 inference.'
)
description
=
'Simpl
ifi
ed version of DeepSpeech2 inference.'
)
parser
.
add_argument
(
"--num_samples"
,
default
=
10
,
type
=
int
,
help
=
"Number of inference samples."
)
"--num_samples"
,
default
=
10
,
type
=
int
,
help
=
"Number of samples for inference."
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number."
)
parser
.
add_argument
(
...
...
@@ -21,13 +28,21 @@ args = parser.parse_args()
def remove_duplicate_and_blank(id_list, blank_id):
    """
    Postprocessing for max-ctc-decoder.

    - remove consecutive duplicate tokens.
    - remove blanks.

    Duplicates are collapsed before blanks are dropped, so two equal tokens
    separated by a blank are both kept.

    :param id_list: Token ids to post-process.
    :type id_list: list
    :param blank_id: Token id treated as blank; every occurrence is removed.
    :return: Token ids with consecutive repeats collapsed and blanks removed.
    :rtype: list
    """
    # groupby yields one (key, group) pair per run of equal neighbours, so
    # keeping only the keys collapses each run to a single token
    collapsed = [key for key, _ in groupby(id_list)]
    # remove blanks
    return [token_id for token_id in collapsed if token_id != blank_id]
def
max_infer
():
"""
Max-ctc-decoding for DeepSpeech2.
"""
# create network config
_
,
vocab_list
=
audio_data_utils
.
get_vocabulary
()
dict_size
=
len
(
vocab_list
)
...
...
@@ -64,7 +79,7 @@ def max_infer():
padding
=
[
-
1
,
1000
])
infer_data
=
test_batch_reader
().
next
()
# run
inference
# run
max-ctc-decoding
max_id_results
=
paddle
.
infer
(
output_layer
=
max_id
,
parameters
=
parameters
,
...
...
librispeech.py
浏览文件 @
0babc5c4
"""
Download, unpack and create manifest for LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It serves as a unified
interface to organize different data sets.
"""
import
paddle.v2
as
paddle
import
os
import
wget
...
...
@@ -88,9 +96,10 @@ def main():
url
=
URL_DEV
,
target_dir
=
os
.
path
.
join
(
args
.
target_dir
),
manifest_path
=
args
.
manifest
+
".dev"
)
#prepare_dataset(url=URL_TRAIN,
#target_dir=os.path.join(args.target_dir),
#manifest_path=args.manifest + ".train")
prepare_dataset
(
url
=
URL_TRAIN
,
target_dir
=
os
.
path
.
join
(
args
.
target_dir
),
manifest_path
=
args
.
manifest
+
".train"
)
if
__name__
==
'__main__'
:
...
...
model.py
浏览文件 @
0babc5c4
"""
A simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
#TODO: add bidirectional rnn.
def
conv_bn_layer
(
input
,
filter_size
,
num_channels_in
,
num_channels_out
,
stride
,
padding
,
act
):
"""
Convolution layer with batch normalization.
"""
conv_layer
=
paddle
.
layer
.
img_conv
(
input
=
input
,
filter_size
=
filter_size
,
...
...
@@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
def
bidirectonal_simple_rnn_bn_layer
(
name
,
input
,
size
,
act
):
"""
Bidirectional simple rnn layer with batch normalization.
The batch normalization is only performed on input-state projection
(sequence-wise normalization).
Question: are the mean and variance statistics computed over the whole sequence
or just on each individual time step?
"""
def
__simple_rnn_step__
(
input
):
last_state
=
paddle
.
layer
.
memory
(
name
=
name
+
"_state"
,
size
=
size
)
input_fc
=
paddle
.
layer
.
fc
(
...
...
@@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
size
=
size
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
# batch norm is only performed on input-state projection
input_fc_bn
=
paddle
.
layer
.
batch_norm
(
input
=
input_fc
,
act
=
paddle
.
activation
.
Linear
())
state_fc
=
paddle
.
layer
.
fc
(
...
...
@@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
def
conv_group
(
input
,
num_stacks
):
"""
Convolution group with several stacking convolution layers.
"""
conv
=
conv_bn_layer
(
input
=
input
,
filter_size
=
(
11
,
41
),
...
...
@@ -68,6 +90,9 @@ def conv_group(input, num_stacks):
def
rnn_group
(
input
,
size
,
num_stacks
):
"""
RNN group with several stacking RNN layers.
"""
output
=
input
for
i
in
xrange
(
num_stacks
):
output
=
bidirectonal_simple_rnn_bn_layer
(
...
...
@@ -81,7 +106,27 @@ def deep_speech2(audio_data,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
rnn_size
=
256
):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:return: Tuple of the cost layer and the max_id decoder layer.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output
=
conv_group
(
input
=
audio_data
,
num_stacks
=
num_conv_layers
)
# convert data from convolution feature map to sequence of vectors
conv2seq
=
paddle
.
layer
.
block_expand
(
input
=
conv_group_output
,
num_channels
=
32
,
...
...
@@ -89,18 +134,22 @@ def deep_speech2(audio_data,
stride_y
=
1
,
block_x
=
1
,
block_y
=
21
)
# rnn group
rnn_group_output
=
rnn_group
(
input
=
conv2seq
,
size
=
rnn_size
,
num_stacks
=
num_rnn_layers
)
# output token distribution
fc
=
paddle
.
layer
.
fc
(
input
=
rnn_group_output
,
size
=
dict_size
+
1
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
True
)
# ctc cost
cost
=
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
# max decoder
max_id
=
paddle
.
layer
.
max_id
(
input
=
fc
)
return
cost
,
max_id
train.py
浏览文件 @
0babc5c4
"""
Trainer for a simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
import
audio_data_utils
import
argparse
from
model
import
deep_speech2
import
gzip
import
sys
from
model
import
deep_speech2
import
audio_data_utils
#TODO: add WER metric
parser
=
argparse
.
ArgumentParser
(
description
=
'Simpled version of DeepSpeech2 trainer.'
)
description
=
'Simpl
ifi
ed version of DeepSpeech2 trainer.'
)
parser
.
add_argument
(
"--batch_size"
,
default
=
512
,
type
=
int
,
help
=
"Minibatch size."
)
parser
.
add_argument
(
"--trainer"
,
default
=
1
,
type
=
int
,
help
=
"Trainer number."
)
parser
.
add_argument
(
"--num_passes"
,
default
=
20
,
type
=
int
,
help
=
"Training pass number."
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number."
)
"--num_conv_layers"
,
default
=
3
,
type
=
int
,
help
=
"Convolution layer number."
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number."
)
"--num_rnn_layers"
,
default
=
5
,
type
=
int
,
help
=
"RNN layer number."
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
256
,
type
=
int
,
help
=
"RNN layer cell number."
)
parser
.
add_argument
(
...
...
@@ -25,6 +32,9 @@ args = parser.parse_args()
def
train
():
"""
DeepSpeech2 training.
"""
# create network config
dict_size
=
audio_data_utils
.
get_vocabulary_size
()
audio_data
=
paddle
.
layer
.
data
(
...
...
@@ -89,8 +99,7 @@ def train():
sys
.
stdout
.
flush
()
if
isinstance
(
event
,
paddle
.
event
.
EndPass
):
result
=
trainer
.
test
(
reader
=
test_batch_reader
,
feeding
=
feeding
)
print
"Pass: %d, TestCost: %f, %s"
%
(
event
.
pass_id
,
event
.
cost
,
result
.
metrics
)
print
"Pass: %d, TestMetric: %s"
%
(
event
.
pass_id
,
result
.
metrics
)
with
gzip
.
open
(
"params.tar.gz"
,
'w'
)
as
f
:
parameters
.
to_tar
(
f
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录