Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
70a343a4
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
接近 1 年 前同步成功
通知
205
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
70a343a4
编写于
5月 25, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add inference and add SortaGrad for only the first pass.
上级
3fc94427
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
248 addition
and
114 deletion
+248
-114
README.md
README.md
+2
-0
audio_data_utils.py
audio_data_utils.py
+4
-0
infer.py
infer.py
+94
-0
librispeech.py
librispeech.py
+1
-1
model.py
model.py
+106
-0
requirements.sh
requirements.sh
+1
-1
train.py
train.py
+40
-112
未找到文件。
README.md
浏览文件 @
70a343a4
...
...
@@ -5,3 +5,5 @@ sh requirements.sh
python librispeech.py
python train.py
```
Please add the warp-ctc library path (usually $PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib) to LD_LIBRARY_PATH.
audio_data_utils.py
浏览文件 @
70a343a4
...
...
@@ -90,6 +90,10 @@ def get_vocabulary_size():
return
len
(
vocab_dict
)
def get_vocabulary():
    """Load the vocabulary stored at ENGLISH_CHAR_VOCAB_FILEPATH.

    Returns whatever ``vocabulary_from_file`` returns for that path.
    """
    vocabulary = vocabulary_from_file(ENGLISH_CHAR_VOCAB_FILEPATH)
    return vocabulary
def
parse_transcript
(
text
,
vocabulary
):
"""
Convert the transcript text string to list of token index integers..
...
...
infer.py
0 → 100644
浏览文件 @
70a343a4
import
paddle.v2
as
paddle
import
audio_data_utils
import
argparse
from
model
import
deep_speech2
import
gzip
from
itertools
import
groupby
def _str2bool(value):
    """Convert a command-line string into a bool for argparse.

    ``type=bool`` calls ``bool()`` on the raw string, so every non-empty
    value -- including "False" and "0" -- is parsed as True. This converter
    understands the usual spellings instead. The empty string still maps to
    False, which is what ``bool("")`` gave.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognized boolean.
    """
    text = value.strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % value)


# Command-line options of the inference script; parsed once at import time.
parser = argparse.ArgumentParser(
    description='Simpled version of DeepSpeech2 inference.')
parser.add_argument(
    "--num_samples",
    default=10,
    type=int,
    help="Number of inference samples.")
parser.add_argument(
    "--num_conv_layers",
    default=2,
    type=int,
    help="Convolution layer number.")
parser.add_argument(
    "--num_rnn_layers",
    default=3,
    type=int,
    help="RNN layer number.")
parser.add_argument(
    "--rnn_layer_size",
    default=512,
    type=int,
    help="RNN layer cell number.")
parser.add_argument(
    "--use_gpu",
    default=True,
    # Not type=bool: that would turn "--use_gpu False" into True.
    type=_str2bool,
    help="Use gpu or not.")
args = parser.parse_args()
def remove_duplicate_and_blank(id_list, blank_id):
    """Collapse a token-id sequence: merge repeats, then drop blanks.

    Args:
        id_list: iterable of token ids.
        blank_id: id of the blank token to remove.

    Returns:
        A new list in which every run of consecutive equal ids has been
        reduced to a single id and all occurrences of ``blank_id`` removed.
        Blanks are removed only after merging, so equal ids separated by a
        blank are both kept.
    """
    # groupby yields one (key, run) pair per run of consecutive equal ids;
    # keeping only non-blank keys does both steps in a single pass.
    return [token for token, _ in groupby(id_list) if token != blank_id]
def max_infer():
    """Decode one test batch by taking the max-id output and print the result.

    Builds the DeepSpeech2 network from the command-line options, loads its
    parameters from "params.tar.gz", reads the first batch (up to
    ``args.num_samples`` utterances) from "./libri.manifest.test", runs
    inference, and prints the target and the decoded transcript of every
    utterance in that batch.
    """
    # create network config
    _, vocab_list = audio_data_utils.get_vocabulary()
    dict_size = len(vocab_list)
    audio_data = paddle.layer.data(
        name="audio_spectrogram",
        height=161,
        width=1000,
        type=paddle.data_type.dense_vector(161000))
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(dict_size))
    _, max_id = deep_speech2(
        audio_data=audio_data,
        text_data=text_data,
        dict_size=dict_size,
        num_conv_layers=args.num_conv_layers,
        num_rnn_layers=args.num_rnn_layers,
        rnn_size=args.rnn_layer_size)

    # load parameters; close the archive once it has been read
    with gzip.open("params.tar.gz") as param_file:
        parameters = paddle.parameters.Parameters.from_tar(param_file)

    # prepare infer data: only the first batch of the test set is decoded
    test_batch_reader = audio_data_utils.padding_batch_reader(
        paddle.batch(
            audio_data_utils.reader_creator(
                manifest_path="./libri.manifest.test",
                sort_by_duration=False),
            batch_size=args.num_samples),
        padding=[-1, 1000])
    infer_data = next(test_batch_reader())

    # run inference
    max_id_results = paddle.infer(
        output_layer=max_id,
        parameters=parameters,
        input=infer_data,
        field=['id'])

    # postprocess
    # The ids of all utterances arrive as one flat sequence; this assumes
    # every utterance contributes the same number of ids (inputs are padded
    # to a common width) -- TODO confirm against padding_batch_reader.
    # Use the size of the batch actually read, which can be smaller than
    # args.num_samples when the manifest is short.
    num_instances = len(infer_data)
    instance_length = len(max_id_results) // num_instances
    # Utterance i owns the i-th window of instance_length ids. Slicing from
    # i instead of i * instance_length would give overlapping windows.
    instance_list = [
        max_id_results[i * instance_length:(i + 1) * instance_length]
        for i in xrange(num_instances)
    ]
    for i, instance in enumerate(instance_list):
        # dict_size is the blank index the network is built with.
        id_list = remove_duplicate_and_blank(instance, dict_size)
        output_transcript = ''.join([vocab_list[token] for token in id_list])
        target_transcript = ''.join(
            [vocab_list[token] for token in infer_data[i][1]])
        print("Target Transcript: %s \nOutput Transcript: %s \n" %
              (target_transcript, output_transcript))
def main():
    """Initialize PaddlePaddle with a single trainer, then run inference."""
    paddle.init(trainer_count=1, use_gpu=args.use_gpu)
    max_infer()


# Run inference only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
librispeech.py
浏览文件 @
70a343a4
...
...
@@ -23,7 +23,7 @@ parser.add_argument(
"--manifest"
,
default
=
"./libri.manifest"
,
type
=
str
,
help
=
"Filepath prefix
of
output manifests."
)
help
=
"Filepath prefix
for
output manifests."
)
args
=
parser
.
parse_args
()
...
...
model.py
0 → 100644
浏览文件 @
70a343a4
import
paddle.v2
as
paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """Build a bias-free linear convolution followed by batch normalization.

    Args:
        input: layer feeding the convolution.
        filter_size, stride, padding: forwarded unchanged to
            ``paddle.layer.img_conv``.
        num_channels_in: passed as ``num_channels`` of the convolution.
        num_channels_out: passed as ``num_filters`` of the convolution.
        act: activation applied by the batch-norm layer; the convolution
            itself is linear.

    Returns:
        The batch-norm layer.
    """
    linear_conv = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    normalized = paddle.layer.batch_norm(input=linear_conv, act=act)
    return normalized
def bidirectonal_simple_rnn_bn_layer(name, input, size, act):
    """Build a simple recurrent layer with batch norm on the input projection.

    Despite the name, only the forward direction is built. The backward
    direction (a second ``recurrent_group`` with ``reverse=True``, to be
    concatenated with the forward one) is disabled because, per the original
    author's note, the ``reverse`` argument is not exposed in the V2
    ``recurrent_group``.

    Args:
        name: prefix of the recurrent state; the state layer and its memory
            are both named ``name + "_state"``.
        input: input sequence layer.
        size: size of the projections and of the recurrent state.
        act: activation applied to the sum that forms the new state.

    Returns:
        The forward recurrent group.
    """
    state_name = name + "_state"

    def __simple_rnn_step__(input):
        # Memory refers to the addto layer below through the shared name.
        previous_state = paddle.layer.memory(name=state_name, size=size)
        # Input path: linear projection without bias, then batch norm.
        projected_input = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        normalized_input = paddle.layer.batch_norm(
            input=projected_input, act=paddle.activation.Linear())
        # Recurrent path: linear projection of the previous state, no bias.
        projected_state = paddle.layer.fc(
            input=previous_state,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # New state = act(normalized input + projected previous state).
        return paddle.layer.addto(
            name=state_name,
            input=[normalized_input, projected_state],
            act=act)

    forward = paddle.layer.recurrent_group(
        step=__simple_rnn_step__, input=input)
    return forward
def conv_group(input, num_stacks):
    """Stack ``num_stacks`` convolution + batch-norm layers.

    The first layer maps 1 channel to 32 with an (11, 41) filter, stride
    (3, 2) and padding (5, 20). Each of the remaining ``num_stacks - 1``
    layers maps 32 channels to 32 with an (11, 21) filter, stride (1, 2) and
    padding (5, 10). All layers use the BRelu activation. The first layer is
    always created, even when ``num_stacks`` is less than 1.

    Returns:
        The output of the last convolution layer.
    """
    stacked = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    remaining = num_stacks - 1
    for _ in xrange(remaining):
        stacked = conv_bn_layer(
            input=stacked,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    return stacked
def rnn_group(input, size, num_stacks):
    """Stack ``num_stacks`` simple recurrent layers on top of ``input``.

    Layer ``k`` (counting from 0) is named ``str(k)``, has state size
    ``size`` and uses the BRelu activation. With ``num_stacks`` of 0 the
    input is returned unchanged.

    Returns:
        The output of the last recurrent layer.
    """
    current = input
    for layer_index in xrange(num_stacks):
        layer_name = str(layer_index)
        current = bidirectonal_simple_rnn_bn_layer(
            name=layer_name,
            input=current,
            size=size,
            act=paddle.activation.BRelu())
    return current
def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256):
    """Assemble the simplified DeepSpeech2 network.

    Pipeline: convolution group -> block_expand -> recurrent group -> linear
    fully connected layer of size ``dict_size + 1``. That layer feeds both a
    warp-CTC cost (blank index ``dict_size``) and a max-id layer.

    Args:
        audio_data: data layer passed to the convolution group.
        text_data: data layer used as the label of the CTC cost.
        dict_size: vocabulary size, not counting the blank.
        num_conv_layers: number of stacked convolution layers.
        num_rnn_layers: number of stacked recurrent layers.
        rnn_size: state size of every recurrent layer.

    Returns:
        A ``(cost, max_id)`` tuple of layers.
    """
    # convolution group
    conv_features = conv_group(input=audio_data, num_stacks=num_conv_layers)
    # expand the 32-channel convolution output in 1 x 21 blocks
    # (presumably turning the feature map into a sequence -- TODO confirm)
    sequence_input = paddle.layer.block_expand(
        input=conv_features,
        num_channels=32,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=21)
    # recurrent group
    rnn_features = rnn_group(
        input=sequence_input, size=rnn_size, num_stacks=num_rnn_layers)
    # one output unit per vocabulary entry plus one for the blank
    num_outputs = dict_size + 1
    output_scores = paddle.layer.fc(
        input=rnn_features,
        size=num_outputs,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # training cost
    ctc_cost = paddle.layer.warp_ctc(
        input=output_scores,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    # max-id output used for decoding
    best_ids = paddle.layer.max_id(input=output_scores)
    return ctc_cost, best_ids
requirements.sh
浏览文件 @
70a343a4
pip
install
wget
pip
install
soundfile
# For
Linux
only
# For
Ubuntu
only
apt-get
install
libsndfile1
train.py
浏览文件 @
70a343a4
import
paddle.v2
as
paddle
import
audio_data_utils
import
argparse
from
model
import
deep_speech2
import
gzip
parser
=
argparse
.
ArgumentParser
(
description
=
'Simpled version of DeepSpeech2 trainer.'
)
...
...
@@ -9,114 +11,19 @@ parser.add_argument(
parser
.
add_argument
(
"--trainer"
,
default
=
1
,
type
=
int
,
help
=
"Trainer number."
)
parser
.
add_argument
(
"--num_passes"
,
default
=
20
,
type
=
int
,
help
=
"Training pass number."
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number."
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number."
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
256
,
type
=
int
,
help
=
"RNN layer cell number."
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
bool
,
help
=
"Use gpu or not."
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number."
)
args
=
parser
.
parse_args
()
def
conv_bn_layer
(
input
,
filter_size
,
num_channels_in
,
num_channels_out
,
stride
,
padding
,
act
):
conv_layer
=
paddle
.
layer
.
img_conv
(
input
=
input
,
filter_size
=
filter_size
,
num_channels
=
num_channels_in
,
num_filters
=
num_channels_out
,
stride
=
stride
,
padding
=
padding
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
return
paddle
.
layer
.
batch_norm
(
input
=
conv_layer
,
act
=
act
)
def
bidirectonal_simple_rnn_bn_layer
(
name
,
input
,
size
,
act
):
def
__simple_rnn_step__
(
input
):
last_state
=
paddle
.
layer
.
memory
(
name
=
name
+
"_state"
,
size
=
size
)
input_fc
=
paddle
.
layer
.
fc
(
input
=
input
,
size
=
size
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
input_fc_bn
=
paddle
.
layer
.
batch_norm
(
input
=
input_fc
,
act
=
paddle
.
activation
.
Linear
())
state_fc
=
paddle
.
layer
.
fc
(
input
=
last_state
,
size
=
size
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
False
)
return
paddle
.
layer
.
addto
(
name
=
name
+
"_state"
,
input
=
[
input_fc_bn
,
state_fc
],
act
=
act
)
forward
=
paddle
.
layer
.
recurrent_group
(
step
=
__simple_rnn_step__
,
input
=
input
)
return
forward
# argument reverse is not exposed in V2 recurrent_group
#backward = paddle.layer.recurrent_group(
#step=__simple_rnn_step__,
#input=input,
#reverse=True)
#return paddle.layer.concat(input=[forward, backward])
def
conv_group
(
input
):
conv1
=
conv_bn_layer
(
input
=
input
,
filter_size
=
(
11
,
41
),
num_channels_in
=
1
,
num_channels_out
=
32
,
stride
=
(
3
,
2
),
padding
=
(
5
,
20
),
act
=
paddle
.
activation
.
BRelu
())
conv2
=
conv_bn_layer
(
input
=
conv1
,
filter_size
=
(
11
,
21
),
num_channels_in
=
32
,
num_channels_out
=
32
,
stride
=
(
1
,
2
),
padding
=
(
5
,
10
),
act
=
paddle
.
activation
.
BRelu
())
conv3
=
conv_bn_layer
(
input
=
conv2
,
filter_size
=
(
11
,
21
),
num_channels_in
=
32
,
num_channels_out
=
32
,
stride
=
(
1
,
2
),
padding
=
(
5
,
10
),
act
=
paddle
.
activation
.
BRelu
())
return
conv3
def
rnn_group
(
input
,
size
,
num_stacks
):
output
=
input
for
i
in
xrange
(
num_stacks
):
output
=
bidirectonal_simple_rnn_bn_layer
(
name
=
str
(
i
),
input
=
output
,
size
=
size
,
act
=
paddle
.
activation
.
BRelu
())
return
output
def
deep_speech2
(
audio_data
,
text_data
,
dict_size
):
conv_group_output
=
conv_group
(
input
=
audio_data
)
conv2seq
=
paddle
.
layer
.
block_expand
(
input
=
conv_group_output
,
num_channels
=
32
,
stride_x
=
1
,
stride_y
=
1
,
block_x
=
1
,
block_y
=
21
)
rnn_group_output
=
rnn_group
(
input
=
conv2seq
,
size
=
256
,
num_stacks
=
5
)
fc
=
paddle
.
layer
.
fc
(
input
=
rnn_group_output
,
size
=
dict_size
+
1
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
True
)
cost
=
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
return
cost
def
train
():
# create network config
dict_size
=
audio_data_utils
.
get_vocabulary_size
()
...
...
@@ -128,7 +35,13 @@ def train():
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
dict_size
))
cost
=
deep_speech2
(
audio_data
,
text_data
,
dict_size
)
cost
,
_
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
dict_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
)
# create parameters and optimizer
parameters
=
paddle
.
parameters
.
create
(
cost
)
...
...
@@ -138,21 +51,30 @@ def train():
regularization
=
paddle
.
optimizer
.
L2Regularization
(
rate
=
8e-4
))
trainer
=
paddle
.
trainer
.
SGD
(
cost
=
cost
,
parameters
=
parameters
,
update_equation
=
optimizer
)
return
# create data readers
feeding
=
{
"audio_spectrogram"
:
0
,
"transcript_text"
:
1
,
}
train_batch_reader
=
audio_data_utils
.
padding_batch_reader
(
train_batch_reader
_with_sortagrad
=
audio_data_utils
.
padding_batch_reader
(
paddle
.
batch
(
audio_data_utils
.
reader_creator
(
"./libri.manifest.dev"
),
audio_data_utils
.
reader_creator
(
manifest_path
=
"./libri.manifest.dev"
,
sort_by_duration
=
True
),
batch_size
=
args
.
batch_size
//
args
.
trainer
),
padding
=
[
-
1
,
1000
])
train_batch_reader_without_sortagrad
=
audio_data_utils
.
padding_batch_reader
(
paddle
.
batch
(
audio_data_utils
.
reader_creator
(
manifest_path
=
"./libri.manifest.dev"
,
sort_by_duration
=
False
,
shuffle
=
True
),
batch_size
=
args
.
batch_size
//
args
.
trainer
),
padding
=
[
-
1
,
1000
])
test_batch_reader
=
audio_data_utils
.
padding_batch_reader
(
paddle
.
batch
(
audio_data_utils
.
reader_creator
(
"./libri.manifest.test"
),
audio_data_utils
.
reader_creator
(
manifest_path
=
"./libri.manifest.test"
,
sort_by_duration
=
False
),
batch_size
=
args
.
batch_size
//
args
.
trainer
),
padding
=
[
-
1
,
1000
])
...
...
@@ -174,13 +96,19 @@ def train():
# run train
trainer
.
train
(
reader
=
train_batch_reader
,
reader
=
train_batch_reader_with_sortagrad
,
event_handler
=
event_handler
,
num_passes
=
1
,
feeding
=
feeding
)
trainer
.
train
(
reader
=
train_batch_reader_without_sortagrad
,
event_handler
=
event_handler
,
num_passes
=
10
,
num_passes
=
self
.
num_passes
-
1
,
feeding
=
feeding
)
def main():
    """Initialize PaddlePaddle from the command-line options, then train."""
    paddle.init(trainer_count=args.trainer_count, use_gpu=args.use_gpu)
    train()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录