Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
7739b52e
M
models
项目概览
PaddlePaddle
/
models
1 年多 前同步成功
通知
222
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7739b52e
编写于
5月 25, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add function docs.
上级
47b706cc
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
153 addition
and
19 deletion
+153
-19
deep_speech_2/audio_data_utils.py
deep_speech_2/audio_data_utils.py
+54
-2
deep_speech_2/infer.py
deep_speech_2/infer.py
+22
-7
deep_speech_2/librispeech.py
deep_speech_2/librispeech.py
+12
-3
deep_speech_2/model.py
deep_speech_2/model.py
+49
-0
deep_speech_2/train.py
deep_speech_2/train.py
+16
-7
未找到文件。
deep_speech_2/audio_data_utils.py
浏览文件 @
7739b52e
"""
Audio data preprocessing tools and reader creators.
"""
import
paddle.v2
as
paddle
import
logging
import
json
...
...
@@ -86,18 +89,24 @@ def vocabulary_from_file(vocabulary_path):
def
get_vocabulary_size
():
"""
Get vocabulary size.
"""
vocab_dict
,
_
=
vocabulary_from_file
(
ENGLISH_CHAR_VOCAB_FILEPATH
)
return
len
(
vocab_dict
)
def
get_vocabulary
():
"""
Get vocabulary.
"""
return
vocabulary_from_file
(
ENGLISH_CHAR_VOCAB_FILEPATH
)
def
parse_transcript
(
text
,
vocabulary
):
"""
Convert the transcript text string to list of token index integers.
.
"""
Convert the transcript text string to list of token index integers
.
"""
return
[
vocabulary
[
w
]
for
w
in
text
]
...
...
@@ -106,6 +115,28 @@ def reader_creator(manifest_path,
shuffle
=
False
,
max_duration
=
10.0
,
min_duration
=
0.0
):
"""
Audio data reader creator.
Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
tokenized transcription text.
:param manifest_path: Filepath for Manifest of audio clip files.
:type manifest_path: basestring
:param sort_by_duration: Sort the audio clips by duration if set True.
For SortaGrad.
:type sort_by_duration: bool
:param shuffle: Shuffle the audio clips if set True.
:type shuffle: bool
:param max_duration: Audio clips with duration (in seconds) greater than
this will be discarded.
:type max_duration: float
:param min_duration: Audio clips with duration (in seconds) smaller than
this will be discarded.
:type min_duration: float
:return: Data reader function.
:rtype: callable
"""
if
sort_by_duration
and
shuffle
:
sort_by_duration
=
False
logger
.
warn
(
"When shuffle set to true, "
...
...
@@ -138,6 +169,27 @@ def reader_creator(manifest_path,
def
padding_batch_reader
(
batch_reader
,
padding
=
[
-
1
,
-
1
],
flatten
=
True
):
"""
Padding for batches. Return a batch reader.
Each instance in a batch will be padded to be of a same target shape.
The target shape is the largest shape among all the batch instances and
'padding' argument. Therefore, if padding is set [-1, -1], instance will be
padded to have the same shape just within each batch and the shape will
be different across batches; if padding is set
[VERY_LARGE_NUM, VERY_LARGE_NUM], instances in all batches will be padded to
have the same shape of [VERY_LARGE_NUM, VERY_LARGE_NUM].
:param batch_reader: Input batch reader.
:type batch_reader: callable
:param padding: Padding pattern. For details, please refer to the description above.
:type padding: list
:param flatten: Flatten the tensor to be one dimension.
:type flatten: bool
:return: Batch reader function.
:rtype: callable
"""
def
padding_batch
(
batch
):
new_batch
=
[]
# get target shape within batch
...
...
deep_speech_2/infer.py
浏览文件 @
7739b52e
"""
Inference for a simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
import
audio_data_utils
from
itertools
import
groupby
import
argparse
from
model
import
deep_speech2
import
gzip
from
itertools
import
groupby
import
audio_data_utils
from
model
import
deep_speech2
parser
=
argparse
.
ArgumentParser
(
description
=
'Simpled version of DeepSpeech2 inference.'
)
description
=
'Simpl
ifi
ed version of DeepSpeech2 inference.'
)
parser
.
add_argument
(
"--num_samples"
,
default
=
10
,
type
=
int
,
help
=
"Number of inference samples."
)
"--num_samples"
,
default
=
10
,
type
=
int
,
help
=
"Number of samples for inference."
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number."
)
parser
.
add_argument
(
...
...
@@ -21,13 +28,21 @@ args = parser.parse_args()
def
remove_duplicate_and_blank
(
id_list
,
blank_id
):
"""
Postprocessing for max-ctc-decoder.
- remove consecutive duplicate tokens.
- remove blanks.
"""
# remove consecutive duplicate tokens
id_list
=
[
x
[
0
]
for
x
in
groupby
(
id_list
)]
# remove blank
# remove blank
s
return
[
id
for
id
in
id_list
if
id
!=
blank_id
]
def
max_infer
():
"""
Max-ctc-decoding for DeepSpeech2.
"""
# create network config
_
,
vocab_list
=
audio_data_utils
.
get_vocabulary
()
dict_size
=
len
(
vocab_list
)
...
...
@@ -64,7 +79,7 @@ def max_infer():
padding
=
[
-
1
,
1000
])
infer_data
=
test_batch_reader
().
next
()
# run
inference
# run
max-ctc-decoding
max_id_results
=
paddle
.
infer
(
output_layer
=
max_id
,
parameters
=
parameters
,
...
...
deep_speech_2/librispeech.py
浏览文件 @
7739b52e
"""
Download, unpack and create manifest for LibriSpeech dataset.
Manifest is a json file with each line containing one audio clip filepath,
its transcription text string, and its duration. It serves as a unified
interface to organize different data sets.
"""
import
paddle.v2
as
paddle
import
os
import
wget
...
...
@@ -88,9 +96,10 @@ def main():
url
=
URL_DEV
,
target_dir
=
os
.
path
.
join
(
args
.
target_dir
),
manifest_path
=
args
.
manifest
+
".dev"
)
#prepare_dataset(url=URL_TRAIN,
#target_dir=os.path.join(args.target_dir),
#manifest_path=args.manifest + ".train")
prepare_dataset
(
url
=
URL_TRAIN
,
target_dir
=
os
.
path
.
join
(
args
.
target_dir
),
manifest_path
=
args
.
manifest
+
".train"
)
if
__name__
==
'__main__'
:
...
...
deep_speech_2/model.py
浏览文件 @
7739b52e
"""
A simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
#TODO: add bidirectional rnn.
def
conv_bn_layer
(
input
,
filter_size
,
num_channels_in
,
num_channels_out
,
stride
,
padding
,
act
):
"""
Convolution layer with batch normalization.
"""
conv_layer
=
paddle
.
layer
.
img_conv
(
input
=
input
,
filter_size
=
filter_size
,
...
...
@@ -16,6 +25,15 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
def
bidirectonal_simple_rnn_bn_layer
(
name
,
input
,
size
,
act
):
"""
Bidirectional simple rnn layer with batch normalization.
The batch normalization is only performed on input-state projection
(sequence-wise normalization).
Question: are the mean and variance statistics computed over the whole sequence
or just on each individual time step?
"""
def
__simple_rnn_step__
(
input
):
last_state
=
paddle
.
layer
.
memory
(
name
=
name
+
"_state"
,
size
=
size
)
input_fc
=
paddle
.
layer
.
fc
(
...
...
@@ -23,6 +41,7 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
size
=
size
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
# batch norm is only performed on input-state projection
input_fc_bn
=
paddle
.
layer
.
batch_norm
(
input
=
input_fc
,
act
=
paddle
.
activation
.
Linear
())
state_fc
=
paddle
.
layer
.
fc
(
...
...
@@ -47,6 +66,9 @@ def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
def
conv_group
(
input
,
num_stacks
):
"""
Convolution group with several stacking convolution layers.
"""
conv
=
conv_bn_layer
(
input
=
input
,
filter_size
=
(
11
,
41
),
...
...
@@ -68,6 +90,9 @@ def conv_group(input, num_stacks):
def
rnn_group
(
input
,
size
,
num_stacks
):
"""
RNN group with several stacking RNN layers.
"""
output
=
input
for
i
in
xrange
(
num_stacks
):
output
=
bidirectonal_simple_rnn_bn_layer
(
...
...
@@ -81,7 +106,27 @@ def deep_speech2(audio_data,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
rnn_size
=
256
):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:return: Tuple of the cost layer and the max_id decoder layer.
:rtype: tuple of LayerOutput
"""
# convolution group
conv_group_output
=
conv_group
(
input
=
audio_data
,
num_stacks
=
num_conv_layers
)
# convert data from convolution feature map to sequence of vectors
conv2seq
=
paddle
.
layer
.
block_expand
(
input
=
conv_group_output
,
num_channels
=
32
,
...
...
@@ -89,18 +134,22 @@ def deep_speech2(audio_data,
stride_y
=
1
,
block_x
=
1
,
block_y
=
21
)
# rnn group
rnn_group_output
=
rnn_group
(
input
=
conv2seq
,
size
=
rnn_size
,
num_stacks
=
num_rnn_layers
)
# output token distribution
fc
=
paddle
.
layer
.
fc
(
input
=
rnn_group_output
,
size
=
dict_size
+
1
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
True
)
# ctc cost
cost
=
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
# max decoder
max_id
=
paddle
.
layer
.
max_id
(
input
=
fc
)
return
cost
,
max_id
deep_speech_2/train.py
浏览文件 @
7739b52e
"""
Trainer for a simplified version of Baidu DeepSpeech2 model.
"""
import
paddle.v2
as
paddle
import
audio_data_utils
import
argparse
from
model
import
deep_speech2
import
gzip
import
sys
from
model
import
deep_speech2
import
audio_data_utils
#TODO: add WER metric
parser
=
argparse
.
ArgumentParser
(
description
=
'Simpled version of DeepSpeech2 trainer.'
)
description
=
'Simpl
ifi
ed version of DeepSpeech2 trainer.'
)
parser
.
add_argument
(
"--batch_size"
,
default
=
512
,
type
=
int
,
help
=
"Minibatch size."
)
parser
.
add_argument
(
"--trainer"
,
default
=
1
,
type
=
int
,
help
=
"Trainer number."
)
parser
.
add_argument
(
"--num_passes"
,
default
=
20
,
type
=
int
,
help
=
"Training pass number."
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number."
)
"--num_conv_layers"
,
default
=
3
,
type
=
int
,
help
=
"Convolution layer number."
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number."
)
"--num_rnn_layers"
,
default
=
5
,
type
=
int
,
help
=
"RNN layer number."
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
256
,
type
=
int
,
help
=
"RNN layer cell number."
)
parser
.
add_argument
(
...
...
@@ -25,6 +32,9 @@ args = parser.parse_args()
def
train
():
"""
DeepSpeech2 training.
"""
# create network config
dict_size
=
audio_data_utils
.
get_vocabulary_size
()
audio_data
=
paddle
.
layer
.
data
(
...
...
@@ -89,8 +99,7 @@ def train():
sys
.
stdout
.
flush
()
if
isinstance
(
event
,
paddle
.
event
.
EndPass
):
result
=
trainer
.
test
(
reader
=
test_batch_reader
,
feeding
=
feeding
)
print
"Pass: %d, TestCost: %f, %s"
%
(
event
.
pass_id
,
event
.
cost
,
result
.
metrics
)
print
"Pass: %d, TestMetric: %s"
%
(
event
.
pass_id
,
result
.
metrics
)
with
gzip
.
open
(
"params.tar.gz"
,
'w'
)
as
f
:
parameters
.
to_tar
(
f
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录