Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
2a834865
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2a834865
编写于
6月 03, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor decoder interfaces and add ./data directory.
上级
8313895e
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
130 addition
and
63 deletion
+130
-63
README.md
README.md
+2
-0
data/eng_vocab.txt
data/eng_vocab.txt
+0
-0
data/librispeech.py
data/librispeech.py
+0
-0
decoder.py
decoder.py
+60
-0
infer.py
infer.py
+31
-38
model.py
model.py
+21
-13
train.py
train.py
+16
-12
未找到文件。
README.md
浏览文件 @
2a834865
...
...
@@ -16,7 +16,9 @@ For some machines, we also need to install libsndfile1. Details to be added.
### Preparing Dataset(s)
```
cd data
python librispeech.py
cd ..
```
More help for arguments:
...
...
eng_vocab.txt
→
data/
eng_vocab.txt
浏览文件 @
2a834865
文件已移动
librispeech.py
→
data/
librispeech.py
浏览文件 @
2a834865
文件已移动
decoder.py
0 → 100755
浏览文件 @
2a834865
"""
CTC-like decoder utilities.
"""
from
itertools
import
groupby
import
numpy
as
np
def ctc_best_path_decode(probs_seq, vocabulary):
    """
    Best path decoding, also called argmax decoding or greedy decoding.
    Path consisting of the most probable tokens are further post-processed to
    remove consecutive repetitions and all blanks.

    The blank token is taken to be the extra index ``len(vocabulary)``, so
    every row of ``probs_seq`` must have ``len(vocabulary) + 1`` entries.

    :param probs_seq: 2-D list of probabilities over the vocabulary for each
                      character. Each element is a list of float probabilities
                      for one character.
    :type probs_seq: list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :return: Decoding result string (empty if probs_seq has no time steps).
    :rtype: basestring
    :raises ValueError: If any row of probs_seq does not have exactly
                        len(vocabulary) + 1 entries.
    """
    # dimension verification
    for probs in probs_seq:
        if len(probs) != len(vocabulary) + 1:
            raise ValueError("probs_seq dimension mismatched with vocabulary")
    # An empty sequence becomes a 1-D array, on which argmax(axis=1) would
    # fail; there is nothing to decode, so return the empty string.
    if len(probs_seq) == 0:
        return ''
    # argmax to get the best index for each time step
    max_index_list = np.array(probs_seq).argmax(axis=1).tolist()
    # remove consecutive duplicate indexes (groupby yields (key, group) pairs)
    index_list = [index for index, _ in groupby(max_index_list)]
    # remove blank indexes
    blank_index = len(vocabulary)
    index_list = [index for index in index_list if index != blank_index]
    # convert index list to string
    return ''.join([vocabulary[index] for index in index_list])
def ctc_decode(probs_seq, vocabulary, method):
    """
    CTC-like sequence decoding from a sequence of likelihood probabilities.

    :param probs_seq: 2-D list of probabilities over the vocabulary for each
                      character. Each element is a list of float probabilities
                      for one character.
    :type probs_seq: list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param method: Decoding method name, with options: "best_path".
    :type method: basestring
    :return: Decoding result string.
    :rtype: basestring
    :raises ValueError: If any row of probs_seq does not have exactly
                        len(vocabulary) + 1 entries, or if method is not a
                        supported decoding method.
    """
    # Each row needs one probability per vocabulary entry plus one extra.
    for prob_list in probs_seq:
        if len(prob_list) != len(vocabulary) + 1:
            raise ValueError("probs dimension mismatched with vocabulary")
    if method == "best_path":
        return ctc_best_path_decode(probs_seq, vocabulary)
    # Interpolate the offending name so the error is actionable.
    raise ValueError("Decoding method [%s] is not supported." % method)
infer.py
浏览文件 @
2a834865
...
...
@@ -3,12 +3,12 @@
"""
import
paddle.v2
as
paddle
from
itertools
import
groupby
import
distutils.util
import
argparse
import
gzip
from
audio_data_utils
import
DataGenerator
from
model
import
deep_speech2
from
decoder
import
ctc_decode
parser
=
argparse
.
ArgumentParser
(
description
=
'Simplified version of DeepSpeech2 inference.'
)
...
...
@@ -39,12 +39,12 @@ parser.add_argument(
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--normalizer_manifest_path"
,
default
=
'
.
/manifest.libri.train-clean-100'
,
default
=
'
data
/manifest.libri.train-clean-100'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_manifest_path"
,
default
=
'
.
/manifest.libri.test-clean'
,
default
=
'
data
/manifest.libri.test-clean'
,
type
=
str
,
help
=
"Manifest path for decoding. (default: %(default)s)"
)
parser
.
add_argument
(
...
...
@@ -52,34 +52,28 @@ parser.add_argument(
default
=
'./params.tar.gz'
,
type
=
str
,
help
=
"Model filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'data/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
args
=
parser
.
parse_args
()
def remove_duplicate_and_blank(id_list, blank_id):
    """
    Post-process the output of the max-ctc-decoder.

    Runs of consecutive identical tokens are first collapsed into a single
    token, and then every token equal to ``blank_id`` is dropped.

    :param id_list: Sequence of token ids.
    :param blank_id: Id of the blank token to remove.
    :return: List of the remaining token ids, in their original order.
    """
    kept = []
    # groupby yields one (key, group) pair per run of equal adjacent tokens
    for token, _ in groupby(id_list):
        if token != blank_id:
            kept.append(token)
    return kept
def
best_path_decode
():
def
infer
():
"""
Max-ctc-decoding for DeepSpeech2.
"""
# initialize data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
'eng_vocab.txt'
,
vocab_filepath
=
args
.
vocab_filepath
,
normalizer_manifest_path
=
args
.
normalizer_manifest_path
,
normalizer_num_samples
=
200
,
max_duration
=
20.0
,
min_duration
=
0.0
,
stride_ms
=
10
,
window_ms
=
20
)
# create network config
dict_size
=
data_generator
.
vocabulary_size
()
vocab_list
=
data_generator
.
vocabulary_list
()
...
...
@@ -91,13 +85,14 @@ def best_path_decode():
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
dict_size
))
_
,
max_id
=
deep_speech2
(
output_probs
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
dict_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
)
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
True
)
# load parameters
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
...
...
@@ -114,30 +109,28 @@ def best_path_decode():
shuffle
=
False
)
infer_data
=
test_batch_reader
().
next
()
# run max-ctc-decoding
max_id_results
=
paddle
.
infer
(
output_layer
=
max_id
,
parameters
=
parameters
,
input
=
infer_data
,
field
=
[
'id'
])
# postprocess
instance_length
=
len
(
max_id_results
)
/
args
.
num_samples
instance_list
=
[
max_id_results
[
i
*
instance_length
:(
i
+
1
)
*
instance_length
]
for
i
in
xrange
(
0
,
args
.
num_samples
)
# run inference
infer_results
=
paddle
.
infer
(
output_layer
=
output_probs
,
parameters
=
parameters
,
input
=
infer_data
)
num_steps
=
len
(
infer_results
)
/
len
(
infer_data
)
probs_split
=
[
infer_results
[
i
*
num_steps
:(
i
+
1
)
*
num_steps
]
for
i
in
xrange
(
0
,
len
(
infer_data
))
]
for
i
,
instance
in
enumerate
(
instance_list
):
id_list
=
remove_duplicate_and_blank
(
instance
,
dict_size
)
output_transcript
=
''
.
join
([
vocab_list
[
id
]
for
id
in
id_list
])
target_transcript
=
''
.
join
([
vocab_list
[
id
]
for
id
in
infer_data
[
i
][
1
]])
print
(
"Target Transcript: %s
\n
Output Transcript: %s
\n
"
%
(
target_transcript
,
output_transcript
))
# decode and print
for
i
,
probs
in
enumerate
(
probs_split
):
output_transcription
=
ctc_decode
(
probs_seq
=
probs
,
vocabulary
=
vocab_list
,
method
=
"best_path"
)
target_transcription
=
''
.
join
(
[
vocab_list
[
index
]
for
index
in
infer_data
[
i
][
1
]])
print
(
"Target Transcription: %s
\n
Output Transcription: %s
\n
"
%
(
target_transcription
,
output_transcription
))
def
main
():
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
1
)
best_path_decode
()
infer
()
if
__name__
==
'__main__'
:
...
...
model.py
浏览文件 @
2a834865
...
...
@@ -85,7 +85,8 @@ def deep_speech2(audio_data,
dict_size
,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
rnn_size
=
256
):
rnn_size
=
256
,
is_inference
=
False
):
"""
The whole DeepSpeech2 model structure (a simplified version).
...
...
@@ -101,7 +102,12 @@ def deep_speech2(audio_data,
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:return: Tuple of the cost layer and the max_id decoder layer.
:param is_inference: False in the training mode, and True in the
inference mode.
:type is_inference: bool
:return: If is_inference set False, return a ctc cost layer;
if is_inference set True, return a sequence layer of output
probability distribution.
:rtype: tuple of LayerOutput
"""
# convolution group
...
...
@@ -118,19 +124,21 @@ def deep_speech2(audio_data,
# rnn group
rnn_group_output
=
rnn_group
(
input
=
conv2seq
,
size
=
rnn_size
,
num_stacks
=
num_rnn_layers
)
# output token distribution
fc
=
paddle
.
layer
.
fc
(
input
=
rnn_group_output
,
size
=
dict_size
+
1
,
act
=
paddle
.
activation
.
Linear
(),
bias_attr
=
True
)
# ctc cost
cost
=
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
# max decoder
max_id
=
paddle
.
layer
.
max_id
(
input
=
fc
)
return
cost
,
max_id
if
is_inference
:
# probability distribution with softmax
return
paddle
.
layer
.
mixed
(
input
=
paddle
.
layer
.
identity_projection
(
input
=
fc
),
act
=
paddle
.
activation
.
Softmax
())
else
:
# ctc cost
return
paddle
.
layer
.
warp_ctc
(
input
=
fc
,
label
=
text_data
,
size
=
dict_size
+
1
,
blank
=
dict_size
,
norm_by_times
=
True
)
train.py
浏览文件 @
2a834865
...
...
@@ -60,19 +60,24 @@ parser.add_argument(
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--normalizer_manifest_path"
,
default
=
'
.
/manifest.libri.train-clean-100'
,
default
=
'
data
/manifest.libri.train-clean-100'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--train_manifest_path"
,
default
=
'
.
/manifest.libri.train-clean-100'
,
default
=
'
data
/manifest.libri.train-clean-100'
,
type
=
str
,
help
=
"Manifest path for training. (default: %(default)s)"
)
parser
.
add_argument
(
"--dev_manifest_path"
,
default
=
'
.
/manifest.libri.dev-clean'
,
default
=
'
data
/manifest.libri.dev-clean'
,
type
=
str
,
help
=
"Manifest path for validation. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'data/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
args
=
parser
.
parse_args
()
...
...
@@ -82,7 +87,7 @@ def train():
"""
# initialize data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
'eng_vocab.txt'
,
vocab_filepath
=
args
.
vocab_filepath
,
normalizer_manifest_path
=
args
.
normalizer_manifest_path
,
normalizer_num_samples
=
200
,
max_duration
=
20.0
,
...
...
@@ -100,13 +105,14 @@ def train():
text_data
=
paddle
.
layer
.
data
(
name
=
"transcript_text"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
dict_size
))
cost
,
_
=
deep_speech2
(
cost
=
deep_speech2
(
audio_data
=
audio_data
,
text_data
=
text_data
,
dict_size
=
dict_size
,
num_conv_layers
=
args
.
num_conv_layers
,
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_size
=
args
.
rnn_layer_size
)
rnn_size
=
args
.
rnn_layer_size
,
is_inference
=
False
)
# create parameters and optimizer
parameters
=
paddle
.
parameters
.
create
(
cost
)
...
...
@@ -118,21 +124,21 @@ def train():
# prepare data reader
train_batch_reader_sortagrad
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
train_manifest_path
,
batch_size
=
args
.
batch_size
//
args
.
trainer_count
,
batch_size
=
args
.
batch_size
,
padding_to
=
2000
,
flatten
=
True
,
sort_by_duration
=
True
,
shuffle
=
False
)
train_batch_reader_nosortagrad
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
train_manifest_path
,
batch_size
=
args
.
batch_size
//
args
.
trainer_count
,
batch_size
=
args
.
batch_size
,
padding_to
=
2000
,
flatten
=
True
,
sort_by_duration
=
False
,
shuffle
=
True
)
test_batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
dev_manifest_path
,
batch_size
=
args
.
batch_size
//
args
.
trainer_count
,
batch_size
=
args
.
batch_size
,
padding_to
=
2000
,
flatten
=
True
,
sort_by_duration
=
False
,
...
...
@@ -141,9 +147,7 @@ def train():
# create event handler
def
event_handler
(
event
):
global
start_time
global
cost_sum
global
cost_counter
global
start_time
,
cost_sum
,
cost_counter
if
isinstance
(
event
,
paddle
.
event
.
EndIteration
):
cost_sum
+=
event
.
cost
cost_counter
+=
1
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录