PaddlePaddle / models - commit 274899d3
Authored Sep 04, 2017 by Xinghai Sun
Reduce the config parsing code for DS2 and make it look cleaner.
Parent commit: 53e59ee7

Showing 8 changed files with 415 additions and 691 deletions (+415, -691).
deep_speech_2/decoder.py        +9    -6
deep_speech_2/demo_server.py    +75   -112
deep_speech_2/evaluate.py       +78   -128
deep_speech_2/infer.py          +77   -128
deep_speech_2/model.py          +9    -10
deep_speech_2/train.py          +88   -147
deep_speech_2/tune.py           +79   -135
deep_speech_2/utils.py          +0    -25
deep_speech_2/decoder.py

@@ -9,8 +9,9 @@ from math import log
 import multiprocessing


-def ctc_best_path_decoder(probs_seq, vocabulary):
-    """Best path decoder, also called argmax decoder or greedy decoder.
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """CTC greedy (best path) decoder.
+
     Path consisting of the most probable tokens are further post-processed to
     remove consecutive repetitions and all blanks.
     ...
@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
                             cutoff_prob=1.0,
                             ext_scoring_func=None,
                             nproc=False):
-    """Beam search decoder for CTC-trained network. It utilizes beam search
-    to approximately select top best decoding labels and returning results
-    in the descending order. The implementation is based on Prefix
-    Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
+    """CTC Beam search decoder.
+
+    It utilizes beam search to approximately select top best decoding
+    labels and returning results in the descending order.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
     redesigned. Two important modifications: 1) in the iterative computation
     of probabilities, the assignment operation is changed to accumulation for
     one prefix may comes from different paths; 2) the if condition "if l^+ not
     ...
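Editor's note: the renamed ctc_greedy_decoder keeps the behaviour described in the new docstring: take the most probable token at every timestep, collapse consecutive repetitions, then drop blanks. The sketch below is a minimal illustration of that procedure, not the project's implementation; in particular, the blank index is assumed here to be the last entry of each probability vector, whereas the actual convention is whatever decoder.py defines.

    # Sketch of CTC greedy (best path) decoding: per-timestep argmax,
    # collapse repeats, remove blanks. Blank index is assumed to be the
    # last column of each probability vector.
    import numpy as np


    def greedy_decode_sketch(probs_seq, vocabulary):
        blank_index = len(vocabulary)
        # best path: argmax at every timestep
        best_path = [int(np.argmax(step)) for step in probs_seq]
        # collapse consecutive repetitions
        collapsed = [idx for i, idx in enumerate(best_path)
                     if i == 0 or idx != best_path[i - 1]]
        # drop blanks and map the rest to characters
        return ''.join(vocabulary[idx] for idx in collapsed if idx != blank_index)


    # Toy example: two-character vocabulary plus a trailing blank column.
    vocab = ['a', 'b']
    probs = [[0.6, 0.3, 0.1],   # 'a'
             [0.6, 0.3, 0.1],   # 'a' again (collapsed)
             [0.1, 0.2, 0.7],   # blank
             [0.2, 0.7, 0.1]]   # 'b'
    print(greedy_decode_sketch(probs, vocab))  # -> "ab"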
deep_speech_2/demo_server.py

@@ -9,118 +9,74 @@ import SocketServer
 import struct
 import wave
 import paddle.v2 as paddle
-from utils import print_arguments
 from data_utils.data import DataGenerator
 from model import DeepSpeech2Model
 from data_utils.utils import read_manifest

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--host_ip",
-    default="localhost",
-    type=str,
-    help="Server IP address. (default: %(default)s)")
-... (the remaining verbose parser.add_argument() calls for --host_port,
-     --speech_save_dir, --vocab_filepath, --mean_std_filepath,
-     --warmup_manifest_path, --specgram_type, --num_conv_layers,
-     --num_rnn_layers, --rnn_layer_size, --share_rnn_weights, --use_gru,
-     --use_gpu, --model_filepath, --decode_method, --beam_size,
-     --language_model_path, --alpha, --beta and --cutoff_prob are removed)
+
+
+def add_arg(argname, type, default, help, **kwargs):
+    type = distutils.util.strtobool if type == bool else type
+    parser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+# yapf: disable
+# configurations of overall
+add_arg('host_port',        int,    8086,    "Server's IP port.")
+add_arg('host_ip',          str,    'localhost',  "Server's IP address.")
+add_arg('speech_save_dir',  str,    'demo_cache', "Directory to save demo audios.")
+add_arg('use_gpu',          bool,   True,    "Use GPU or not.")
+# configurations of decoder
+add_arg('beam_size',        int,    500,     "Beam search width.")
+add_arg('alpha',            float,  0.36,    "Coef of LM for beam search.")
+add_arg('beta',             float,  0.25,    "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  0.99,    "Cutoff probability for pruning.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoder_method',   str,    'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices=['ctc_beam_search', 'ctc_greedy'])
+# configurations of data preprocess
+add_arg('specgram_type',    str,    'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# configurations of model structure
+add_arg('num_conv_layers',  int,    2,       "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,       "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,    "# of recurrent cells per layer.")
+add_arg('use_gru',          bool,   False,   "Use GRUs instead of Simple RNNs.")
+add_arg('share_rnn_weights', bool,  True,    "Share input-hidden weights across "
+                                             "bi-directional RNNs. Not for GRU.")
+# configurations of data io
+add_arg('warmup_manifest',  str,    'datasets/manifest.test',
+        "Filepath of manifest to warm up.")
+add_arg('mean_std_path',    str,    'mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,    'datasets/vocab/eng_vocab.txt',
+        "Filepath of vocabulary.")
+# configurations of model io
+add_arg('model_path',       str,    './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
 args = parser.parse_args()
+# yapf: disable


 class AsrTCPServer(SocketServer.TCPServer):
     ...
@@ -200,8 +156,8 @@ def start_server():
     """Start the ASR server"""
     # prepare data generator
     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
         num_threads=1)
     ...
@@ -212,7 +168,7 @@ def start_server():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)
     # prepare ASR inference handler
     ...
@@ -220,13 +176,13 @@ def start_server():
         feature = data_generator.process_utterance(filename, "")
         result_transcript = ds2_model.infer_batch(
             infer_data=[feature],
-            decode_method=args.decode_method,
+            decoder_method=args.decoder_method,
             beam_alpha=args.alpha,
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
             num_processes=1)
         return result_transcript[0]
     ...
@@ -235,7 +191,7 @@ def start_server():
     print('Warming up ...')
     warm_up_test(
         audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest_path,
+        manifest_path=args.warmup_manifest,
         num_test_cases=3)
     print('-----------------------------------------------------------')
     ...
@@ -249,6 +205,13 @@ def start_server():
     server.serve_forever()


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
 def main():
     print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=1)
     ...
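Editor's note: the add_arg helper added above (and repeated verbatim in evaluate.py, infer.py, train.py and tune.py below) is the whole of the new config-parsing machinery. A condensed, runnable sketch of the pattern follows; the helper body mirrors the diff, while the two example flags and the sample argv are made up for illustration.

    # Standalone sketch of the add_arg pattern introduced by this commit.
    import argparse
    import distutils.util

    parser = argparse.ArgumentParser(description=__doc__)


    def add_arg(argname, type, default, help, **kwargs):
        # bool flags go through strtobool so "--flag False" works on the CLI
        type = distutils.util.strtobool if type == bool else type
        parser.add_argument(
            "--" + argname,
            default=default,
            type=type,
            help=help + ' Default: %(default)s.',
            **kwargs)


    # Illustrative flags only (not the full set from any one script).
    add_arg('beam_size', int,  500,  "Beam search width.")
    add_arg('use_gpu',   bool, True, "Use GPU or not.")

    args = parser.parse_args(['--beam_size', '200', '--use_gpu', 'False'])
    print("%s %s" % (args.beam_size, args.use_gpu))  # -> 200 0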
deep_speech_2/evaluate.py

@@ -10,140 +10,83 @@ import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import DeepSpeech2Model
 from error_rate import wer, cer
-import utils
+
+NUM_CPU = multiprocessing.cpu_count() // 2

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--batch_size",
-    default=128,
-    type=int,
-    help="Minibatch size for evaluation. (default: %(default)s)")
-... (the remaining verbose parser.add_argument() calls for --trainer_count,
-     --num_conv_layers, --num_rnn_layers, --rnn_layer_size, --share_rnn_weights,
-     --use_gru, --use_gpu, --num_threads_data, --num_processes_beam_search,
-     --mean_std_filepath, --decode_method, --language_model_path, --alpha,
-     --beta, --cutoff_prob, --beam_size, --specgram_type,
-     --decode_manifest_path, --model_filepath, --vocab_filepath and
-     --error_rate_type are removed)
+
+
+def add_arg(argname, type, default, help, **kwargs):
+    type = distutils.util.strtobool if type == bool else type
+    parser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+# yapf: disable
+# configurations of overall
+add_arg('batch_size',       int,    128,    "Minibatch size.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('error_rate_type',  str,    'wer',  "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+# configurations of decoder
+add_arg('beam_size',        int,    500,    "Beam search width.")
+add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('parallels_bsearch', int,   NUM_CPU, "# of CPUs for beam search.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoder_method',   str,    'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices=['ctc_beam_search', 'ctc_greedy'])
+# configurations of data preprocess
+add_arg('parallels_data',   int,    NUM_CPU, "# of CPUs for data preprocessing.")
+add_arg('specgram_type',    str,    'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# configurations of model structure
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+# configurations of data io
+add_arg('test_manifest',    str,    'datasets/manifest.test',
+        "Filepath of manifest to evaluate.")
+add_arg('mean_std_path',    str,    'mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,    'datasets/vocab/eng_vocab.txt',
+        "Filepath of vocabulary.")
+# configurations of model io
+add_arg('model_path',       str,    './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
 args = parser.parse_args()
+# yapf: disable


 def evaluate():
     """Evaluate on whole test data for DeepSpeech2."""
     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
     batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.test_manifest,
         batch_size=args.batch_size,
         min_batch_size=1,
         sortagrad=False,
         ...
@@ -155,7 +98,7 @@ def evaluate():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)

     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     ...
@@ -163,14 +106,14 @@ def evaluate():
     for infer_data in batch_reader():
         result_transcripts = ds2_model.infer_batch(
             infer_data=infer_data,
-            decode_method=args.decode_method,
+            decoder_method=args.decoder_method,
             beam_alpha=args.alpha,
             beam_beta=args.beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
-            num_processes=args.num_processes_beam_search)
+            num_processes=args.parallels_bsearch)
         target_transcripts = [
             ''.join([data_generator.vocab_list[token] for token in transcript])
             for _, transcript in infer_data
         ...
@@ -184,8 +127,15 @@ def evaluate():
           (args.error_rate_type, num_ins, num_ins, error_sum / num_ins))


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     evaluate()
     ...
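Editor's note: evaluate.py reports either word or character error rate via error_rate.wer / error_rate.cer, which this commit does not touch. For reference, word error rate is conventionally the word-level edit distance between the target and the decoded transcript divided by the number of target words. The sketch below illustrates that standard definition only; the project's error_rate module may differ in details such as tokenisation or case handling.

    # Sketch of word error rate: Levenshtein distance over words divided by
    # the number of reference words.
    def wer_sketch(reference, hypothesis):
        ref, hyp = reference.split(), hypothesis.split()
        # dp[i][j] = edit distance between ref[:i] and hyp[:j]
        dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
        for i in range(len(ref) + 1):
            dp[i][0] = i
        for j in range(len(hyp) + 1):
            dp[0][j] = j
        for i in range(1, len(ref) + 1):
            for j in range(1, len(hyp) + 1):
                cost = 0 if ref[i - 1] == hyp[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                               dp[i][j - 1] + 1,         # insertion
                               dp[i - 1][j - 1] + cost)  # substitution
        return float(dp[len(ref)][len(hyp)]) / len(ref)


    print(wer_sketch("the quick brown fox", "the quick brown box"))  # -> 0.25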
deep_speech_2/infer.py

@@ -10,140 +10,82 @@ import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import DeepSpeech2Model
 from error_rate import wer, cer
-import utils
+
+NUM_CPU = multiprocessing.cpu_count() // 2

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--num_samples",
-    default=10,
-    type=int,
-    help="Number of samples for inference. (default: %(default)s)")
-... (the remaining verbose parser.add_argument() calls for --num_conv_layers,
-     --num_rnn_layers, --rnn_layer_size, --share_rnn_weights, --use_gru,
-     --use_gpu, --num_threads_data, --num_processes_beam_search,
-     --specgram_type, --trainer_count, --mean_std_filepath,
-     --decode_manifest_path, --model_filepath, --vocab_filepath,
-     --decode_method, --beam_size, --language_model_path, --alpha, --beta,
-     --cutoff_prob and --error_rate_type are removed)
+
+
+def add_arg(argname, type, default, help, **kwargs):
+    type = distutils.util.strtobool if type == bool else type
+    parser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+# yapf: disable
+# configurations of overall
+add_arg('num_samples',      int,    10,     "# of samples to infer.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('error_rate_type',  str,    'wer',  "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+# configurations of decoder
+add_arg('beam_size',        int,    500,    "Beam search width.")
+add_arg('alpha',            float,  0.36,   "Coef of LM for beam search.")
+add_arg('beta',             float,  0.25,   "Coef of WC for beam search.")
+add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('parallels_bsearch', int,   NUM_CPU, "# of CPUs for beam search.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+add_arg('decoder_method',   str,    'ctc_beam_search',
+        "Decoder method. Options: ctc_beam_search, ctc_greedy",
+        choices=['ctc_beam_search', 'ctc_greedy'])
+# configurations of data preprocess
+add_arg('specgram_type',    str,    'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# configurations of model structure
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+# configurations of data io
+add_arg('infer_manifest',   str,    'datasets/manifest.dev',
+        "Filepath of manifest to infer.")
+add_arg('mean_std_path',    str,    'mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,    'datasets/vocab/eng_vocab.txt',
+        "Filepath of vocabulary.")
+# configurations of model io
+add_arg('model_path',       str,    './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
 args = parser.parse_args()
+# yapf: disable


 def infer():
     """Inference for DeepSpeech2."""
     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
     batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.decode_manifest_path,
+        manifest_path=args.infer_manifest,
         batch_size=args.num_samples,
         min_batch_size=1,
         sortagrad=False,
         ...
@@ -156,18 +98,18 @@ def infer():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)
     result_transcripts = ds2_model.infer_batch(
         infer_data=infer_data,
-        decode_method=args.decode_method,
+        decoder_method=args.decoder_method,
         beam_alpha=args.alpha,
         beam_beta=args.beta,
         beam_size=args.beam_size,
         cutoff_prob=args.cutoff_prob,
         vocab_list=data_generator.vocab_list,
-        language_model_path=args.language_model_path,
+        language_model_path=args.lang_model_path,
-        num_processes=args.num_processes_beam_search)
+        num_processes=args.parallels_bsearch)

     error_rate_func = cer if args.error_rate_type == 'cer' else wer
     target_transcripts = [
     ...
@@ -181,8 +123,15 @@ def infer():
           (args.error_rate_type, error_rate_func(target, result)))


+def print_arguments(args):
+    ... (same local helper as added to evaluate.py)
+
+
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     infer()
     ...
deep_speech_2/model.py

@@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
         # run inference
         return self._loss_inferer.infer(input=infer_data)

-    def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta,
+    def infer_batch(self, infer_data, decoder_method, beam_alpha, beam_beta,
                     beam_size, cutoff_prob, vocab_list, language_model_path,
                     num_processes):
         """Model inference. Infer the transcription for a batch of speech
         ...
@@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
                            consisting of a tuple of audio features and
                            transcription text (empty string).
         :type infer_data: list
-        :param decode_method: Decoding method name, 'best_path' or
-                              'beam_search'.
-        :param decode_method: string
+        :param decoder_method: Decoding method name, 'ctc_greedy' or
+                               'ctc_beam_search'.
+        :param decoder_method: string
         :param beam_alpha: Parameter associated with language model.
         :type beam_alpha: float
         :param beam_beta: Parameter associated with word count.
         ...
@@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
         ]
         # run decoder
         results = []
-        if decode_method == "best_path":
+        if decoder_method == "ctc_greedy":
             # best path decode
             for i, probs in enumerate(probs_split):
-                output_transcription = ctc_best_path_decoder(
+                output_transcription = ctc_greedy_decoder(
                     probs_seq=probs, vocabulary=vocab_list)
                 results.append(output_transcription)
-        elif decode_method == "beam_search":
+        elif decoder_method == "ctc_beam_search":
             # initialize external scorer
             if self._ext_scorer == None:
                 self._ext_scorer = LmScorer(beam_alpha, beam_beta,
                 ...
@@ -205,7 +205,6 @@ class DeepSpeech2Model(object):
             else:
                 self._ext_scorer.reset_params(beam_alpha, beam_beta)
                 assert self._loaded_lm_path == language_model_path
-
             # beam search decode
             beam_search_results = ctc_beam_search_decoder_batch(
                 probs_split=probs_split,
                 ...
@@ -218,8 +217,8 @@ class DeepSpeech2Model(object):
             results = [result[0][1] for result in beam_search_results]
         else:
-            raise ValueError("Decoding method [%s] is not supported." %
-                             decode_method)
+            raise ValueError("Decoder method [%s] is not supported." %
+                             decoder_method)
         return results

     def _create_parameters(self, model_path=None):
     ...
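Editor's note: the beam_alpha and beam_beta arguments of infer_batch correspond to the new --alpha ("Coef of LM") and --beta ("Coef of WC") flags. In the DeepSpeech2 paper the beam-search objective has the shallow-fusion form Q(y) = log p_ctc(y|x) + alpha * log p_lm(y) + beta * word_count(y); the sketch below simply evaluates that formula to show what the two coefficients trade off. It is an illustration of the paper's scoring rule, not of the LmScorer implementation, whose exact form is not visible in this diff.

    # Illustrative scoring rule assumed behind beam_alpha / beam_beta.
    import math


    def external_score_sketch(log_prob_ctc, lm_prob, prefix, alpha, beta):
        word_count = len(prefix.split())
        # alpha scales the LM log-probability, beta rewards longer prefixes
        return log_prob_ctc + alpha * math.log(lm_prob) + beta * word_count


    # With alpha=0.36 and beta=0.25 (the defaults added in this commit), a
    # prefix with CTC log-prob -4.0, LM probability 0.01 and 3 words scores:
    print(external_score_sketch(-4.0, 0.01, "the quick fox", 0.36, 0.25))
    # -> approximately -4.91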
deep_speech_2/train.py

@@ -9,169 +9,103 @@ import multiprocessing
 import paddle.v2 as paddle
 from model import DeepSpeech2Model
 from data_utils.data import DataGenerator
-import utils
+
+NUM_CPU = multiprocessing.cpu_count() // 2

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--batch_size", default=256, type=int, help="Minibatch size.")
-... (the remaining verbose parser.add_argument() calls for --num_passes,
-     --num_iterations_print, --num_conv_layers, --num_rnn_layers,
-     --rnn_layer_size, --share_rnn_weights, --use_gru, --adam_learning_rate,
-     --use_gpu, --use_sortagrad, --specgram_type, --max_duration,
-     --min_duration, --shuffle_method, --trainer_count, --num_threads_data,
-     --mean_std_filepath, --train_manifest_path, --dev_manifest_path,
-     --vocab_filepath, --init_model_path, --output_model_dir,
-     --augmentation_config and --is_local are removed)
+
+
+def add_arg(argname, type, default, help, **kwargs):
+    type = distutils.util.strtobool if type == bool else type
+    parser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+# yapf: disable
+# configurations of optimization
+add_arg('batch_size',       int,    256,    "Minibatch size.")
+add_arg('learning_rate',    float,  5e-4,   "Learning rate.")
+add_arg('use_sortagrad',    bool,   True,   "Use SortaGrad or not.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('num_passes',       int,    200,    "# of training epochs.")
+add_arg('is_local',         bool,   True,   "Use pserver or not.")
+add_arg('num_iter_print',   int,    100,    "Every # iterations for printing "
+                                            "train cost.")
+# configurations of data preprocess
+add_arg('max_duration',     float,  27.0,   "Longest audio duration allowed.")
+add_arg('min_duration',     float,  0.0,    "Shortest audio duration allowed.")
+add_arg('parallels_data',   int,    NUM_CPU, "# of CPUs for data preprocessing.")
+add_arg('specgram_type',    str,    'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+add_arg('augment_conf_path', str,   'conf/augmentation.config',
+        "Filepath of augmentation configuration file (json-format).")
+add_arg('shuffle_method',   str,    'batch_shuffle_clipped',
+        "Shuffle method.",
+        choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
+# configurations of model structure
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+# configurations of data io
+add_arg('train_manifest',   str,    'datasets/manifest.train',
+        "Filepath of train manifest.")
+add_arg('dev_manifest',     str,    'datasets/manifest.dev',
+        "Filepath of validation manifest.")
+add_arg('mean_std_path',    str,    'mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,    'datasets/vocab/eng_vocab.txt',
+        "Filepath of vocabulary.")
+# configurations of model io
+add_arg('init_model_path',  str,    None,
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
+add_arg('output_model_dir', str,    "./checkpoints",
+        "Directory for saving checkpoints.")
 args = parser.parse_args()
+# yapf: disable


 def train():
     """DeepSpeech2 training."""
     train_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
-        augmentation_config=args.augmentation_config,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
+        augmentation_config=open(args.augment_conf_path, 'r').read(),
         max_duration=args.max_duration,
         min_duration=args.min_duration,
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
     dev_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config="{}",
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=args.parallels_data)
     train_batch_reader = train_generator.batch_reader_creator(
-        manifest_path=args.train_manifest_path,
+        manifest_path=args.train_manifest,
         batch_size=args.batch_size,
         min_batch_size=args.trainer_count,
         sortagrad=args.use_sortagrad if args.init_model_path is None else False,
         shuffle_method=args.shuffle_method)
     dev_batch_reader = dev_generator.batch_reader_creator(
-        manifest_path=args.dev_manifest_path,
+        manifest_path=args.dev_manifest,
         batch_size=args.batch_size,
         min_batch_size=1,  # must be 1, but will have errors.
         sortagrad=False,
         ...
@@ -184,21 +118,28 @@ def train():
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
         pretrained_model_path=args.init_model_path,
-        share_rnn_weights=args.share_rnn_weights)
+        share_rnn_weights=args.share_weights)
     ds2_model.train(
         train_batch_reader=train_batch_reader,
         dev_batch_reader=dev_batch_reader,
         feeding_dict=train_generator.feeding,
-        learning_rate=args.adam_learning_rate,
+        learning_rate=args.learning_rate,
         gradient_clipping=400,
         num_passes=args.num_passes,
-        num_iterations_print=args.num_iterations_print,
+        num_iterations_print=args.num_iter_print,
         output_model_dir=args.output_model_dir,
         is_local=args.is_local)


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     train()
     ...
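Editor's note: train.py's --use_sortagrad flag toggles the SortaGrad curriculum from the DeepSpeech2 paper: the first epoch presents utterances in order of increasing duration, later epochs shuffle as usual, and the curriculum is skipped when resuming from --init_model_path. The ordering itself lives inside DataGenerator and is not part of this diff; the sketch below only illustrates the idea, with a made-up (duration, utterance) record format.

    # Sketch of the SortaGrad idea: sort the first epoch by audio duration,
    # shuffle from the second epoch on. The sample tuples are illustrative,
    # not the DataGenerator's actual record format.
    import random


    def epoch_order_sketch(samples, epoch, use_sortagrad=True):
        if use_sortagrad and epoch == 0:
            return sorted(samples, key=lambda s: s[0])  # shortest utterances first
        shuffled = list(samples)
        random.shuffle(shuffled)
        return shuffled


    samples = [(12.3, 'utt_a'), (2.1, 'utt_b'), (7.8, 'utt_c')]
    print(epoch_order_sketch(samples, epoch=0))  # sorted by duration
    print(epoch_order_sketch(samples, epoch=1))  # shuffled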
deep_speech_2/tune.py

-"""Parameters tuning for DeepSpeech2 model."""
+"""Beam search parameters tuning for DeepSpeech2 model."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 ...
@@ -11,134 +11,71 @@ import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import DeepSpeech2Model
 from error_rate import wer
-import utils
+
+NUM_CPU = multiprocessing.cpu_count() // 2

 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--num_samples",
-    default=100,
-    type=int,
-    help="Number of samples for parameters tuning. (default: %(default)s)")
-... (the remaining verbose parser.add_argument() calls for --num_conv_layers,
-     --num_rnn_layers, --rnn_layer_size, --share_rnn_weights, --use_gru,
-     --use_gpu, --trainer_count, --num_threads_data,
-     --num_processes_beam_search, --specgram_type, --mean_std_filepath,
-     --tune_manifest_path, --model_filepath, --vocab_filepath, --beam_size,
-     --language_model_path, --alpha_from, --num_alphas, --alpha_to,
-     --beta_from, --num_betas, --beta_to and --cutoff_prob are removed)
+
+
+def add_arg(argname, type, default, help, **kwargs):
+    type = distutils.util.strtobool if type == bool else type
+    parser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+# yapf: disable
+# configurations of overall
+add_arg('num_samples',      int,    100,    "# of samples to infer.")
+add_arg('trainer_count',    int,    8,      "# of Trainers (CPUs or GPUs).")
+add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
+add_arg('error_rate_type',  str,    'wer',  "Error rate type for evaluation.",
+        choices=['wer', 'cer'])
+# configurations of tuning parameters
+add_arg('alpha_from',       float,  0.1,    "Where alpha starts tuning from.")
+add_arg('alpha_to',         float,  0.36,   "Where alpha ends tuning with.")
+add_arg('num_alphas',       int,    14,     "# of alpha candidates for tuning.")
+add_arg('beta_from',        float,  0.05,   "Where beta starts tuning from.")
+add_arg('beta_to',          float,  0.36,   "Where beta ends tuning with.")
+add_arg('num_betas',        int,    20,     "# of beta candidates for tuning.")
+# configurations of decoder
+add_arg('beam_size',        int,    500,    "Beam search width.")
+add_arg('cutoff_prob',      float,  0.99,   "Cutoff probability for pruning.")
+add_arg('parallels_bsearch', int,   NUM_CPU, "# of CPUs for beam search.")
+add_arg('lang_model_path',  str,
+        'lm/data/common_crawl_00.prune01111.trie.klm',
+        "Filepath for language model.")
+# configurations of data preprocess
+add_arg('specgram_type',    str,    'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# configurations of model structure
+add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
+add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
+add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
+add_arg('use_gru',          bool,   False,  "Use GRUs instead of Simple RNNs.")
+add_arg('share_rnn_weights', bool,  True,   "Share input-hidden weights across "
+                                            "bi-directional RNNs. Not for GRU.")
+# configurations of data io
+add_arg('tune_manifest',    str,    'datasets/manifest.test',
+        "Filepath of manifest to tune.")
+add_arg('mean_std_path',    str,    'mean_std.npz',
+        "Filepath of normalizer's mean & std.")
+add_arg('vocab_path',       str,    'datasets/vocab/eng_vocab.txt',
+        "Filepath of vocabulary.")
+# configurations of model io
+add_arg('model_path',       str,    './checkpoints/params.latest.tar.gz',
+        "If None, the training starts from scratch, "
+        "otherwise, it resumes from the pre-trained model.")
 args = parser.parse_args()
+# yapf: disable


 def tune():
     ...
@@ -149,13 +86,13 @@ def tune():
         raise ValueError("num_betas must be non-negative!")

     data_generator = DataGenerator(
-        vocab_filepath=args.vocab_filepath,
-        mean_std_filepath=args.mean_std_filepath,
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
         augmentation_config='{}',
         specgram_type=args.specgram_type,
-        num_threads=args.num_threads_data)
+        num_threads=1)
     batch_reader = data_generator.batch_reader_creator(
-        manifest_path=args.tune_manifest_path,
+        manifest_path=args.tune_manifest,
         batch_size=args.num_samples,
         sortagrad=False,
         shuffle_method=None)
     ...
@@ -171,7 +108,7 @@ def tune():
         num_rnn_layers=args.num_rnn_layers,
         rnn_layer_size=args.rnn_layer_size,
         use_gru=args.use_gru,
-        pretrained_model_path=args.model_filepath,
+        pretrained_model_path=args.model_path,
         share_rnn_weights=args.share_rnn_weights)

     # create grid for search
     ...
@@ -184,14 +121,14 @@ def tune():
     for alpha, beta in params_grid:
         result_transcripts = ds2_model.infer_batch(
             infer_data=tune_data,
-            decode_method='beam_search',
+            decoder_method='ctc_beam_search',
             beam_alpha=alpha,
             beam_beta=beta,
             beam_size=args.beam_size,
             cutoff_prob=args.cutoff_prob,
             vocab_list=data_generator.vocab_list,
-            language_model_path=args.language_model_path,
+            language_model_path=args.lang_model_path,
-            num_processes=args.num_processes_beam_search)
+            num_processes=args.parallels_bsearch)
         wer_sum, num_ins = 0.0, 0
         for target, result in zip(target_transcripts, result_transcripts):
             wer_sum += wer(target, result)
         ...
@@ -200,8 +137,15 @@ def tune():
           (alpha, beta, wer_sum / num_ins))


+def print_arguments(args):
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).iteritems()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
 def main():
-    utils.print_arguments(args)
+    print_arguments(args)
     paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
     tune()
     ...
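Editor's note: tune.py sweeps a grid of (alpha, beta) pairs defined by the new --alpha_from/--alpha_to/--num_alphas and --beta_from/--beta_to/--num_betas flags; the grid construction itself is elided from the hunks above ("# create grid for search"). It is presumably the straightforward Cartesian product sketched below, shown here with the defaults added in this commit.

    # Sketch of the (alpha, beta) grid implied by the tuning flags; the actual
    # construction in tune.py is not visible in this diff.
    import itertools
    import numpy as np

    alpha_from, alpha_to, num_alphas = 0.1, 0.36, 14
    beta_from, beta_to, num_betas = 0.05, 0.36, 20

    alphas = np.linspace(alpha_from, alpha_to, num_alphas)
    betas = np.linspace(beta_from, beta_to, num_betas)
    params_grid = list(itertools.product(alphas, betas))

    print(len(params_grid))  # -> 280 candidate (alpha, beta) pairs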
deep_speech_2/utils.py
deleted (100644 → 0)

-"""Contains common utility functions."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-def print_arguments(args):
-    """Print argparse's arguments.
-
-    Usage:
-
-    .. code-block:: python
-
-        parser = argparse.ArgumentParser()
-        parser.add_argument("name", default="Jonh", type=str, help="User name.")
-        args = parser.parse_args()
-        print_arguments(args)
-
-    :param args: Input argparse.Namespace for printing.
-    :type args: argparse.Namespace
-    """
-    print("----- Configuration Arguments -----")
-    for arg, value in vars(args).iteritems():
-        print("%s: %s" % (arg, value))
-    print("------------------------------------")