Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
274899d3
M
models
项目概览
PaddlePaddle
/
models
1 年多 前同步成功
通知
222
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
274899d3
编写于
9月 04, 2017
作者:
X
Xinghai Sun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Reduce the config parsing codes for DS2 and make it looks cleaner.
上级
53e59ee7
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
415 addition
and
691 deletion
+415
-691
deep_speech_2/decoder.py
deep_speech_2/decoder.py
+9
-6
deep_speech_2/demo_server.py
deep_speech_2/demo_server.py
+75
-112
deep_speech_2/evaluate.py
deep_speech_2/evaluate.py
+78
-128
deep_speech_2/infer.py
deep_speech_2/infer.py
+77
-128
deep_speech_2/model.py
deep_speech_2/model.py
+9
-10
deep_speech_2/train.py
deep_speech_2/train.py
+88
-147
deep_speech_2/tune.py
deep_speech_2/tune.py
+79
-135
deep_speech_2/utils.py
deep_speech_2/utils.py
+0
-25
未找到文件。
deep_speech_2/decoder.py
浏览文件 @
274899d3
...
...
@@ -9,8 +9,9 @@ from math import log
import
multiprocessing
def
ctc_best_path_decoder
(
probs_seq
,
vocabulary
):
"""Best path decoder, also called argmax decoder or greedy decoder.
def
ctc_greedy_decoder
(
probs_seq
,
vocabulary
):
"""CTC greedy (best path) decoder.
Path consisting of the most probable tokens are further post-processed to
remove consecutive repetitions and all blanks.
...
...
@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
cutoff_prob
=
1.0
,
ext_scoring_func
=
None
,
nproc
=
False
):
"""Beam search decoder for CTC-trained network. It utilizes beam search
to approximately select top best decoding labels and returning results
in the descending order. The implementation is based on Prefix
Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is
"""CTC Beam search decoder.
It utilizes beam search to approximately select top best decoding
labels and returning results in the descending order.
The implementation is based on Prefix Beam Search
(https://arxiv.org/abs/1408.2873), and the unclear part is
redesigned. Two important modifications: 1) in the iterative computation
of probabilities, the assignment operation is changed to accumulation for
one prefix may comes from different paths; 2) the if condition "if l^+ not
...
...
deep_speech_2/demo_server.py
浏览文件 @
274899d3
...
...
@@ -9,118 +9,74 @@ import SocketServer
import
struct
import
wave
import
paddle.v2
as
paddle
from
utils
import
print_arguments
from
data_utils.data
import
DataGenerator
from
model
import
DeepSpeech2Model
from
data_utils.utils
import
read_manifest
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
"--host_ip"
,
default
=
"localhost"
,
type
=
str
,
help
=
"Server IP address. (default: %(default)s)"
)
parser
.
add_argument
(
"--host_port"
,
default
=
8086
,
type
=
int
,
help
=
"Server Port. (default: %(default)s)"
)
parser
.
add_argument
(
"--speech_save_dir"
,
default
=
"demo_cache"
,
type
=
str
,
help
=
"Directory for saving demo speech. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'datasets/vocab/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--warmup_manifest_path"
,
default
=
'datasets/manifest.test'
,
type
=
str
,
help
=
"Manifest path for warmup test. (default: %(default)s)"
)
parser
.
add_argument
(
"--specgram_type"
,
default
=
'linear'
,
type
=
str
,
help
=
"Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
2048
,
type
=
int
,
help
=
"RNN layer cell number. (default: %(default)s)"
)
parser
.
add_argument
(
"--share_rnn_weights"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--use_gru"
,
default
=
False
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use GRU or simple RNN. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--model_filepath"
,
default
=
'checkpoints/params.latest.tar.gz'
,
type
=
str
,
help
=
"Model filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_method"
,
default
=
'beam_search'
,
type
=
str
,
help
=
"Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--beam_size"
,
default
=
100
,
type
=
int
,
help
=
"Width for beam search decoding. (default: %(default)d)"
)
parser
.
add_argument
(
"--language_model_path"
,
default
=
"lm/data/common_crawl_00.prune01111.trie.klm"
,
type
=
str
,
help
=
"Path for language model. (default: %(default)s)"
)
parser
.
add_argument
(
"--alpha"
,
default
=
0.36
,
type
=
float
,
help
=
"Parameter associated with language model. (default: %(default)f)"
)
parser
.
add_argument
(
"--beta"
,
default
=
0.25
,
type
=
float
,
help
=
"Parameter associated with word count. (default: %(default)f)"
)
parser
.
add_argument
(
"--cutoff_prob"
,
default
=
0.99
,
type
=
float
,
help
=
"The cutoff probability of pruning"
"in beam search. (default: %(default)f)"
)
def
add_arg
(
argname
,
type
,
default
,
help
,
**
kwargs
):
type
=
distutils
.
util
.
strtobool
if
type
==
bool
else
type
parser
.
add_argument
(
"--"
+
argname
,
default
=
default
,
type
=
type
,
help
=
help
+
' Default: %(default)s.'
,
**
kwargs
)
# yapf: disable
# configurations of overall
add_arg
(
'host_port'
,
int
,
8086
,
"Server's IP port."
)
add_arg
(
'host_ip'
,
str
,
'localhost'
,
"Server's IP address."
)
add_arg
(
'speech_save_dir'
,
str
,
'demo_cache'
,
"Directory to save demo audios."
)
add_arg
(
'use_gpu'
,
bool
,
True
,
"Use GPU or not."
)
# configurations of decoder
add_arg
(
'beam_size'
,
int
,
500
,
"Beam search width."
)
add_arg
(
'alpha'
,
float
,
0.36
,
"Coef of LM for beam search."
)
add_arg
(
'beta'
,
float
,
0.25
,
"Coef of WC for beam search."
)
add_arg
(
'cutoff_prob'
,
float
,
0.99
,
"Cutoff probability for pruning."
)
add_arg
(
'lang_model_path'
,
str
,
'lm/data/common_crawl_00.prune01111.trie.klm'
,
"Filepath for language model."
)
add_arg
(
'decoder_method'
,
str
,
'ctc_beam_search'
,
"Decoder method. Options: ctc_beam_search, ctc_greedy"
,
choices
=
[
'ctc_beam_search'
,
'ctc_greedy'
])
# configurations of data preprocess
add_arg
(
'specgram_type'
,
str
,
'linear'
,
"Audio feature type. Options: linear, mfcc."
,
choices
=
[
'linear'
,
'mfcc'
])
# configurations of model structure
add_arg
(
'num_conv_layers'
,
int
,
2
,
"# of convolution layers."
)
add_arg
(
'num_rnn_layers'
,
int
,
3
,
"# of recurrent layers."
)
add_arg
(
'rnn_layer_size'
,
int
,
2048
,
"# of recurrent cells per layer."
)
add_arg
(
'use_gru'
,
bool
,
False
,
"Use GRUs instead of Simple RNNs."
)
add_arg
(
'share_rnn_weights'
,
bool
,
True
,
"Share input-hidden weights across "
"bi-directional RNNs. Not for GRU."
)
# configurations of data io
add_arg
(
'warmup_manifest'
,
str
,
'datasets/manifest.test'
,
"Filepath of manifest to warm up."
)
add_arg
(
'mean_std_path'
,
str
,
'mean_std.npz'
,
"Filepath of normalizer's mean & std."
)
add_arg
(
'vocab_path'
,
str
,
'datasets/vocab/eng_vocab.txt'
,
"Filepath of vocabulary."
)
# configurations of model io
add_arg
(
'model_path'
,
str
,
'./checkpoints/params.latest.tar.gz'
,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model."
)
args
=
parser
.
parse_args
()
# yapf: disable
class
AsrTCPServer
(
SocketServer
.
TCPServer
):
...
...
@@ -200,8 +156,8 @@ def start_server():
"""Start the ASR server"""
# prepare data generator
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
1
)
...
...
@@ -212,7 +168,7 @@ def start_server():
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
use_gru
=
args
.
use_gru
,
pretrained_model_path
=
args
.
model_
file
path
,
pretrained_model_path
=
args
.
model_path
,
share_rnn_weights
=
args
.
share_rnn_weights
)
# prepare ASR inference handler
...
...
@@ -220,13 +176,13 @@ def start_server():
feature
=
data_generator
.
process_utterance
(
filename
,
""
)
result_transcript
=
ds2_model
.
infer_batch
(
infer_data
=
[
feature
],
decode
_method
=
args
.
decode
_method
,
decode
r_method
=
args
.
decoder
_method
,
beam_alpha
=
args
.
alpha
,
beam_beta
=
args
.
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
lang
uage
_model_path
,
language_model_path
=
args
.
lang_model_path
,
num_processes
=
1
)
return
result_transcript
[
0
]
...
...
@@ -235,7 +191,7 @@ def start_server():
print
(
'Warming up ...'
)
warm_up_test
(
audio_process_handler
=
file_to_transcript
,
manifest_path
=
args
.
warmup_manifest
_path
,
manifest_path
=
args
.
warmup_manifest
,
num_test_cases
=
3
)
print
(
'-----------------------------------------------------------'
)
...
...
@@ -249,6 +205,13 @@ def start_server():
server
.
serve_forever
()
def
print_arguments
(
args
):
print
(
"----------- Configuration Arguments -----------"
)
for
arg
,
value
in
sorted
(
vars
(
args
).
iteritems
()):
print
(
"%s: %s"
%
(
arg
,
value
))
print
(
"------------------------------------------------"
)
def
main
():
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
1
)
...
...
deep_speech_2/evaluate.py
浏览文件 @
274899d3
...
...
@@ -10,140 +10,83 @@ import paddle.v2 as paddle
from
data_utils.data
import
DataGenerator
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
,
cer
import
utils
NUM_CPU
=
multiprocessing
.
cpu_count
()
//
2
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
"--batch_size"
,
default
=
128
,
type
=
int
,
help
=
"Minibatch size for evaluation. (default: %(default)s)"
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
2048
,
type
=
int
,
help
=
"RNN layer cell number. (default: %(default)s)"
)
parser
.
add_argument
(
"--share_rnn_weights"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--use_gru"
,
default
=
False
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use GRU or simple RNN. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
multiprocessing
.
cpu_count
()
//
2
,
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_processes_beam_search"
,
default
=
multiprocessing
.
cpu_count
()
//
2
,
type
=
int
,
help
=
"Number of cpu processes for beam search. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_method"
,
default
=
'beam_search'
,
type
=
str
,
help
=
"Method for ctc decoding, best_path or beam_search. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--language_model_path"
,
default
=
"lm/data/common_crawl_00.prune01111.trie.klm"
,
type
=
str
,
help
=
"Path for language model. (default: %(default)s)"
)
parser
.
add_argument
(
"--alpha"
,
default
=
0.36
,
type
=
float
,
help
=
"Parameter associated with language model. (default: %(default)f)"
)
parser
.
add_argument
(
"--beta"
,
default
=
0.25
,
type
=
float
,
help
=
"Parameter associated with word count. (default: %(default)f)"
)
parser
.
add_argument
(
"--cutoff_prob"
,
default
=
0.99
,
type
=
float
,
help
=
"The cutoff probability of pruning"
"in beam search. (default: %(default)f)"
)
parser
.
add_argument
(
"--beam_size"
,
default
=
500
,
type
=
int
,
help
=
"Width for beam search decoding. (default: %(default)d)"
)
parser
.
add_argument
(
"--specgram_type"
,
default
=
'linear'
,
type
=
str
,
help
=
"Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_manifest_path"
,
default
=
'datasets/manifest.test'
,
type
=
str
,
help
=
"Manifest path for decoding. (default: %(default)s)"
)
parser
.
add_argument
(
"--model_filepath"
,
default
=
'checkpoints/params.latest.tar.gz'
,
type
=
str
,
help
=
"Model filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'datasets/vocab/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--error_rate_type"
,
default
=
'wer'
,
choices
=
[
'wer'
,
'cer'
],
type
=
str
,
help
=
"Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)"
)
def
add_arg
(
argname
,
type
,
default
,
help
,
**
kwargs
):
type
=
distutils
.
util
.
strtobool
if
type
==
bool
else
type
parser
.
add_argument
(
"--"
+
argname
,
default
=
default
,
type
=
type
,
help
=
help
+
' Default: %(default)s.'
,
**
kwargs
)
# yapf: disable
# configurations of overall
add_arg
(
'batch_size'
,
int
,
128
,
"Minibatch size."
)
add_arg
(
'trainer_count'
,
int
,
8
,
"# of Trainers (CPUs or GPUs)."
)
add_arg
(
'use_gpu'
,
bool
,
True
,
"Use GPU or not."
)
add_arg
(
'error_rate_type'
,
str
,
'wer'
,
"Error rate type for evaluation."
,
choices
=
[
'wer'
,
'cer'
])
# configurations of decoder
add_arg
(
'beam_size'
,
int
,
500
,
"Beam search width."
)
add_arg
(
'alpha'
,
float
,
0.36
,
"Coef of LM for beam search."
)
add_arg
(
'beta'
,
float
,
0.25
,
"Coef of WC for beam search."
)
add_arg
(
'cutoff_prob'
,
float
,
0.99
,
"Cutoff probability for pruning."
)
add_arg
(
'parallels_bsearch'
,
int
,
NUM_CPU
,
"# of CPUs for beam search."
)
add_arg
(
'lang_model_path'
,
str
,
'lm/data/common_crawl_00.prune01111.trie.klm'
,
"Filepath for language model."
)
add_arg
(
'decoder_method'
,
str
,
'ctc_beam_search'
,
"Decoder method. Options: ctc_beam_search, ctc_greedy"
,
choices
=
[
'ctc_beam_search'
,
'ctc_greedy'
])
# configurations of data preprocess
add_arg
(
'parallels_data'
,
int
,
NUM_CPU
,
"# of CPUs for data preprocessing."
)
add_arg
(
'specgram_type'
,
str
,
'linear'
,
"Audio feature type. Options: linear, mfcc."
,
choices
=
[
'linear'
,
'mfcc'
])
# configurations of model structure
add_arg
(
'num_conv_layers'
,
int
,
2
,
"# of convolution layers."
)
add_arg
(
'num_rnn_layers'
,
int
,
3
,
"# of recurrent layers."
)
add_arg
(
'rnn_layer_size'
,
int
,
2048
,
"# of recurrent cells per layer."
)
add_arg
(
'use_gru'
,
bool
,
False
,
"Use GRUs instead of Simple RNNs."
)
add_arg
(
'share_rnn_weights'
,
bool
,
True
,
"Share input-hidden weights across "
"bi-directional RNNs. Not for GRU."
)
# configurations of data io
add_arg
(
'test_manifest'
,
str
,
'datasets/manifest.test'
,
"Filepath of manifest to evaluate."
)
add_arg
(
'mean_std_path'
,
str
,
'mean_std.npz'
,
"Filepath of normalizer's mean & std."
)
add_arg
(
'vocab_path'
,
str
,
'datasets/vocab/eng_vocab.txt'
,
"Filepath of vocabulary."
)
# configurations of model io
add_arg
(
'model_path'
,
str
,
'./checkpoints/params.latest.tar.gz'
,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model."
)
args
=
parser
.
parse_args
()
# yapf: disable
def
evaluate
():
"""Evaluate on whole test data for DeepSpeech2."""
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_thread
s_data
)
num_threads
=
args
.
parallel
s_data
)
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
decode_manifest_path
,
manifest_path
=
args
.
test_manifest
,
batch_size
=
args
.
batch_size
,
min_batch_size
=
1
,
sortagrad
=
False
,
...
...
@@ -155,7 +98,7 @@ def evaluate():
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
use_gru
=
args
.
use_gru
,
pretrained_model_path
=
args
.
model_
file
path
,
pretrained_model_path
=
args
.
model_path
,
share_rnn_weights
=
args
.
share_rnn_weights
)
error_rate_func
=
cer
if
args
.
error_rate_type
==
'cer'
else
wer
...
...
@@ -163,14 +106,14 @@ def evaluate():
for
infer_data
in
batch_reader
():
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
infer_data
,
decode
_method
=
args
.
decode
_method
,
decode
r_method
=
args
.
decoder
_method
,
beam_alpha
=
args
.
alpha
,
beam_beta
=
args
.
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
lang
uage
_model_path
,
num_processes
=
args
.
num_processes_beam_
search
)
language_model_path
=
args
.
lang_model_path
,
num_processes
=
args
.
parallels_b
search
)
target_transcripts
=
[
''
.
join
([
data_generator
.
vocab_list
[
token
]
for
token
in
transcript
])
for
_
,
transcript
in
infer_data
...
...
@@ -184,8 +127,15 @@ def evaluate():
(
args
.
error_rate_type
,
num_ins
,
num_ins
,
error_sum
/
num_ins
))
def
print_arguments
(
args
):
print
(
"----------- Configuration Arguments -----------"
)
for
arg
,
value
in
sorted
(
vars
(
args
).
iteritems
()):
print
(
"%s: %s"
%
(
arg
,
value
))
print
(
"------------------------------------------------"
)
def
main
():
utils
.
print_arguments
(
args
)
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
args
.
trainer_count
)
evaluate
()
...
...
deep_speech_2/infer.py
浏览文件 @
274899d3
...
...
@@ -10,140 +10,82 @@ import paddle.v2 as paddle
from
data_utils.data
import
DataGenerator
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
,
cer
import
utils
NUM_CPU
=
multiprocessing
.
cpu_count
()
//
2
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
"--num_samples"
,
default
=
10
,
type
=
int
,
help
=
"Number of samples for inference. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
2048
,
type
=
int
,
help
=
"RNN layer cell number. (default: %(default)s)"
)
parser
.
add_argument
(
"--share_rnn_weights"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--use_gru"
,
default
=
False
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use GRU or simple RNN. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
1
,
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_processes_beam_search"
,
default
=
multiprocessing
.
cpu_count
()
//
2
,
type
=
int
,
help
=
"Number of cpu processes for beam search. (default: %(default)s)"
)
parser
.
add_argument
(
"--specgram_type"
,
default
=
'linear'
,
type
=
str
,
help
=
"Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)"
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_manifest_path"
,
default
=
'datasets/manifest.test'
,
type
=
str
,
help
=
"Manifest path for decoding. (default: %(default)s)"
)
parser
.
add_argument
(
"--model_filepath"
,
default
=
'checkpoints/params.latest.tar.gz'
,
type
=
str
,
help
=
"Model filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'datasets/vocab/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--decode_method"
,
default
=
'beam_search'
,
type
=
str
,
help
=
"Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--beam_size"
,
default
=
500
,
type
=
int
,
help
=
"Width for beam search decoding. (default: %(default)d)"
)
parser
.
add_argument
(
"--language_model_path"
,
default
=
"lm/data/common_crawl_00.prune01111.trie.klm"
,
type
=
str
,
help
=
"Path for language model. (default: %(default)s)"
)
parser
.
add_argument
(
"--alpha"
,
default
=
0.36
,
type
=
float
,
help
=
"Parameter associated with language model. (default: %(default)f)"
)
parser
.
add_argument
(
"--beta"
,
default
=
0.25
,
type
=
float
,
help
=
"Parameter associated with word count. (default: %(default)f)"
)
parser
.
add_argument
(
"--cutoff_prob"
,
default
=
0.99
,
type
=
float
,
help
=
"The cutoff probability of pruning"
"in beam search. (default: %(default)f)"
)
parser
.
add_argument
(
"--error_rate_type"
,
default
=
'wer'
,
choices
=
[
'wer'
,
'cer'
],
type
=
str
,
help
=
"Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)"
)
def
add_arg
(
argname
,
type
,
default
,
help
,
**
kwargs
):
type
=
distutils
.
util
.
strtobool
if
type
==
bool
else
type
parser
.
add_argument
(
"--"
+
argname
,
default
=
default
,
type
=
type
,
help
=
help
+
' Default: %(default)s.'
,
**
kwargs
)
# yapf: disable
# configurations of overall
add_arg
(
'num_samples'
,
int
,
10
,
"# of samples to infer."
)
add_arg
(
'trainer_count'
,
int
,
8
,
"# of Trainers (CPUs or GPUs)."
)
add_arg
(
'use_gpu'
,
bool
,
True
,
"Use GPU or not."
)
add_arg
(
'error_rate_type'
,
str
,
'wer'
,
"Error rate type for evaluation."
,
choices
=
[
'wer'
,
'cer'
])
# configurations of decoder
add_arg
(
'beam_size'
,
int
,
500
,
"Beam search width."
)
add_arg
(
'alpha'
,
float
,
0.36
,
"Coef of LM for beam search."
)
add_arg
(
'beta'
,
float
,
0.25
,
"Coef of WC for beam search."
)
add_arg
(
'cutoff_prob'
,
float
,
0.99
,
"Cutoff probability for pruning."
)
add_arg
(
'parallels_bsearch'
,
int
,
NUM_CPU
,
"# of CPUs for beam search."
)
add_arg
(
'lang_model_path'
,
str
,
'lm/data/common_crawl_00.prune01111.trie.klm'
,
"Filepath for language model."
)
add_arg
(
'decoder_method'
,
str
,
'ctc_beam_search'
,
"Decoder method. Options: ctc_beam_search, ctc_greedy"
,
choices
=
[
'ctc_beam_search'
,
'ctc_greedy'
])
# configurations of data preprocess
add_arg
(
'specgram_type'
,
str
,
'linear'
,
"Audio feature type. Options: linear, mfcc."
,
choices
=
[
'linear'
,
'mfcc'
])
# configurations of model structure
add_arg
(
'num_conv_layers'
,
int
,
2
,
"# of convolution layers."
)
add_arg
(
'num_rnn_layers'
,
int
,
3
,
"# of recurrent layers."
)
add_arg
(
'rnn_layer_size'
,
int
,
2048
,
"# of recurrent cells per layer."
)
add_arg
(
'use_gru'
,
bool
,
False
,
"Use GRUs instead of Simple RNNs."
)
add_arg
(
'share_rnn_weights'
,
bool
,
True
,
"Share input-hidden weights across "
"bi-directional RNNs. Not for GRU."
)
# configurations of data io
add_arg
(
'infer_manifest'
,
str
,
'datasets/manifest.dev'
,
"Filepath of manifest to infer."
)
add_arg
(
'mean_std_path'
,
str
,
'mean_std.npz'
,
"Filepath of normalizer's mean & std."
)
add_arg
(
'vocab_path'
,
str
,
'datasets/vocab/eng_vocab.txt'
,
"Filepath of vocabulary."
)
# configurations of model io
add_arg
(
'model_path'
,
str
,
'./checkpoints/params.latest.tar.gz'
,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model."
)
args
=
parser
.
parse_args
()
# yapf: disable
def
infer
():
"""Inference for DeepSpeech2."""
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
num_threads
=
1
)
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
decode_manifest_path
,
manifest_path
=
args
.
infer_manifest
,
batch_size
=
args
.
num_samples
,
min_batch_size
=
1
,
sortagrad
=
False
,
...
...
@@ -156,18 +98,18 @@ def infer():
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
use_gru
=
args
.
use_gru
,
pretrained_model_path
=
args
.
model_
file
path
,
pretrained_model_path
=
args
.
model_path
,
share_rnn_weights
=
args
.
share_rnn_weights
)
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
infer_data
,
decode
_method
=
args
.
decode
_method
,
decode
r_method
=
args
.
decoder
_method
,
beam_alpha
=
args
.
alpha
,
beam_beta
=
args
.
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
lang
uage
_model_path
,
num_processes
=
args
.
num_processes_beam_
search
)
language_model_path
=
args
.
lang_model_path
,
num_processes
=
args
.
parallels_b
search
)
error_rate_func
=
cer
if
args
.
error_rate_type
==
'cer'
else
wer
target_transcripts
=
[
...
...
@@ -181,8 +123,15 @@ def infer():
(
args
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
def
print_arguments
(
args
):
print
(
"----------- Configuration Arguments -----------"
)
for
arg
,
value
in
sorted
(
vars
(
args
).
iteritems
()):
print
(
"%s: %s"
%
(
arg
,
value
))
print
(
"------------------------------------------------"
)
def
main
():
utils
.
print_arguments
(
args
)
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
args
.
trainer_count
)
infer
()
...
...
deep_speech_2/model.py
浏览文件 @
274899d3
...
...
@@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
# run inference
return
self
.
_loss_inferer
.
infer
(
input
=
infer_data
)
def
infer_batch
(
self
,
infer_data
,
decode_method
,
beam_alpha
,
beam_beta
,
def
infer_batch
(
self
,
infer_data
,
decode
r
_method
,
beam_alpha
,
beam_beta
,
beam_size
,
cutoff_prob
,
vocab_list
,
language_model_path
,
num_processes
):
"""Model inference. Infer the transcription for a batch of speech
...
...
@@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
consisting of a tuple of audio features and
transcription text (empty string).
:type infer_data: list
:param decode
_method: Decoding method name, 'best_path
' or
'beam
search'.
:param decode_method: string
:param decode
r_method: Decoding method name, 'ctc_greedy
' or
'ctc_beam_
search'.
:param decode
r
_method: string
:param beam_alpha: Parameter associated with language model.
:type beam_alpha: float
:param beam_beta: Parameter associated with word count.
...
...
@@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
]
# run decoder
results
=
[]
if
decode
_method
==
"best_path
"
:
if
decode
r_method
==
"ctc_greedy
"
:
# best path decode
for
i
,
probs
in
enumerate
(
probs_split
):
output_transcription
=
ctc_
best_path
_decoder
(
output_transcription
=
ctc_
greedy
_decoder
(
probs_seq
=
probs
,
vocabulary
=
vocab_list
)
results
.
append
(
output_transcription
)
elif
decode
_method
==
"
beam_search"
:
elif
decode
r_method
==
"ctc_
beam_search"
:
# initialize external scorer
if
self
.
_ext_scorer
==
None
:
self
.
_ext_scorer
=
LmScorer
(
beam_alpha
,
beam_beta
,
...
...
@@ -205,7 +205,6 @@ class DeepSpeech2Model(object):
else
:
self
.
_ext_scorer
.
reset_params
(
beam_alpha
,
beam_beta
)
assert
self
.
_loaded_lm_path
==
language_model_path
# beam search decode
beam_search_results
=
ctc_beam_search_decoder_batch
(
probs_split
=
probs_split
,
...
...
@@ -218,8 +217,8 @@ class DeepSpeech2Model(object):
results
=
[
result
[
0
][
1
]
for
result
in
beam_search_results
]
else
:
raise
ValueError
(
"Decod
ing
method [%s] is not supported."
%
decode_method
)
raise
ValueError
(
"Decod
er
method [%s] is not supported."
%
decode
r
_method
)
return
results
def
_create_parameters
(
self
,
model_path
=
None
):
...
...
deep_speech_2/train.py
浏览文件 @
274899d3
...
...
@@ -9,169 +9,103 @@ import multiprocessing
import
paddle.v2
as
paddle
from
model
import
DeepSpeech2Model
from
data_utils.data
import
DataGenerator
import
utils
NUM_CPU
=
multiprocessing
.
cpu_count
()
//
2
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
"--batch_size"
,
default
=
256
,
type
=
int
,
help
=
"Minibatch size."
)
parser
.
add_argument
(
"--num_passes"
,
default
=
200
,
type
=
int
,
help
=
"Training pass number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_iterations_print"
,
default
=
100
,
type
=
int
,
help
=
"Number of iterations for every train cost printing. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
2048
,
type
=
int
,
help
=
"RNN layer cell number. (default: %(default)s)"
)
parser
.
add_argument
(
"--share_rnn_weights"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--use_gru"
,
default
=
False
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use GRU or simple RNN. (default: %(default)s)"
)
parser
.
add_argument
(
"--adam_learning_rate"
,
default
=
5e-4
,
type
=
float
,
help
=
"Learning rate for ADAM Optimizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_sortagrad"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use sortagrad or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--specgram_type"
,
default
=
'linear'
,
type
=
str
,
help
=
"Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)"
)
parser
.
add_argument
(
"--max_duration"
,
default
=
27.0
,
type
=
float
,
help
=
"Audios with duration larger than this will be discarded. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--min_duration"
,
default
=
0.0
,
type
=
float
,
help
=
"Audios with duration smaller than this will be discarded. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--shuffle_method"
,
default
=
'batch_shuffle_clipped'
,
type
=
str
,
help
=
"Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)"
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
multiprocessing
.
cpu_count
()
//
2
,
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--train_manifest_path"
,
default
=
'datasets/manifest.train'
,
type
=
str
,
help
=
"Manifest path for training. (default: %(default)s)"
)
parser
.
add_argument
(
"--dev_manifest_path"
,
default
=
'datasets/manifest.dev'
,
type
=
str
,
help
=
"Manifest path for validation. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'datasets/vocab/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--init_model_path"
,
default
=
None
,
type
=
str
,
help
=
"If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)"
)
parser
.
add_argument
(
"--output_model_dir"
,
default
=
"./checkpoints"
,
type
=
str
,
help
=
"Directory for saving models. (default: %(default)s)"
)
parser
.
add_argument
(
"--augmentation_config"
,
default
=
open
(
'conf/augmentation.config'
,
'r'
).
read
(),
type
=
str
,
help
=
"Augmentation configuration in json-format. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--is_local"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Set to false if running with pserver in paddlecloud. "
"(default: %(default)s)"
)
def add_arg(argname, type, default, help, **kwargs):
    """Register one command-line argument on the module-level parser.

    :param argname: Argument name without the leading "--".
    :param type: Expected Python type of the argument. ``bool`` is mapped
                 to ``distutils.util.strtobool`` so that strings such as
                 "true"/"false" on the command line parse correctly
                 (plain ``bool("false")`` would be truthy).
    :param default: Default value; echoed into the help text.
    :param help: Human-readable description. A " Default: ..." suffix is
                 appended automatically.
    :param kwargs: Extra keyword arguments forwarded verbatim to
                   ``parser.add_argument`` (e.g. ``choices``).
    """
    # NOTE: ``type`` and ``help`` intentionally shadow builtins to mirror
    # argparse's own keyword names; kept for caller compatibility.
    # ``is`` (identity) is the correct comparison for the ``bool`` type
    # singleton, rather than ``==``.
    type = distutils.util.strtobool if type is bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)
# yapf: disable
# configurations of optimization
add_arg
(
'batch_size'
,
int
,
256
,
"Minibatch size."
)
add_arg
(
'learning_rate'
,
float
,
5e-4
,
"Learning rate."
)
add_arg
(
'use_sortagrad'
,
bool
,
True
,
"Use SortaGrad or not."
)
add_arg
(
'trainer_count'
,
int
,
8
,
"# of Trainers (CPUs or GPUs)."
)
add_arg
(
'use_gpu'
,
bool
,
True
,
"Use GPU or not."
)
add_arg
(
'num_passes'
,
int
,
200
,
"# of training epochs."
)
add_arg
(
'is_local'
,
bool
,
True
,
"Use pserver or not."
)
add_arg
(
'num_iter_print'
,
int
,
100
,
"Every # iterations for printing "
"train cost."
)
# configurations of data preprocess
add_arg
(
'max_duration'
,
float
,
27.0
,
"Longest audio duration allowed."
)
add_arg
(
'min_duration'
,
float
,
0.0
,
"Shortest audio duration allowed."
)
add_arg
(
'parallels_data'
,
int
,
NUM_CPU
,
"# of CPUs for data preprocessing."
)
add_arg
(
'specgram_type'
,
str
,
'linear'
,
"Audio feature type. Options: linear, mfcc."
,
choices
=
[
'linear'
,
'mfcc'
])
add_arg
(
'augment_conf_path'
,
str
,
'conf/augmentation.config'
,
"Filepath of augmentation configuration file (json-format)."
)
add_arg
(
'shuffle_method'
,
str
,
'batch_shuffle_clipped'
,
"Shuffle method."
,
choices
=
[
'instance_shuffle'
,
'batch_shuffle'
,
'batch_shuffle_clipped'
])
# configurations of model structure
add_arg
(
'num_conv_layers'
,
int
,
2
,
"# of convolution layers."
)
add_arg
(
'num_rnn_layers'
,
int
,
3
,
"# of recurrent layers."
)
add_arg
(
'rnn_layer_size'
,
int
,
2048
,
"# of recurrent cells per layer."
)
add_arg
(
'use_gru'
,
bool
,
False
,
"Use GRUs instead of Simple RNNs."
)
add_arg
(
'share_rnn_weights'
,
bool
,
True
,
"Share input-hidden weights across "
"bi-directional RNNs. Not for GRU."
)
# configurations of data io
add_arg
(
'train_manifest'
,
str
,
'datasets/manifest.train'
,
"Filepath of train manifest."
)
add_arg
(
'dev_manifest'
,
str
,
'datasets/manifest.dev'
,
"Filepath of validation manifest."
)
add_arg
(
'mean_std_path'
,
str
,
'mean_std.npz'
,
"Filepath of normalizer's mean & std."
)
add_arg
(
'vocab_path'
,
str
,
'datasets/vocab/eng_vocab.txt'
,
"Filepath of vocabulary."
)
# configurations of model io
add_arg
(
'init_model_path'
,
str
,
None
,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model."
)
add_arg
(
'output_model_dir'
,
str
,
"./checkpoints"
,
"Directory for saving checkpoints."
)
args
=
parser
.
parse_args
()
# yapf: disable
def
train
():
"""DeepSpeech2 training."""
train_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
augmentation_config
=
args
.
augmentation_config
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
open
(
args
.
augment_conf_path
,
'r'
).
read
()
,
max_duration
=
args
.
max_duration
,
min_duration
=
args
.
min_duration
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_thread
s_data
)
num_threads
=
args
.
parallel
s_data
)
dev_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
"{}"
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_thread
s_data
)
num_threads
=
args
.
parallel
s_data
)
train_batch_reader
=
train_generator
.
batch_reader_creator
(
manifest_path
=
args
.
train_manifest
_path
,
manifest_path
=
args
.
train_manifest
,
batch_size
=
args
.
batch_size
,
min_batch_size
=
args
.
trainer_count
,
sortagrad
=
args
.
use_sortagrad
if
args
.
init_model_path
is
None
else
False
,
shuffle_method
=
args
.
shuffle_method
)
dev_batch_reader
=
dev_generator
.
batch_reader_creator
(
manifest_path
=
args
.
dev_manifest
_path
,
manifest_path
=
args
.
dev_manifest
,
batch_size
=
args
.
batch_size
,
min_batch_size
=
1
,
# must be 1, but will have errors.
sortagrad
=
False
,
...
...
@@ -184,21 +118,28 @@ def train():
rnn_layer_size
=
args
.
rnn_layer_size
,
use_gru
=
args
.
use_gru
,
pretrained_model_path
=
args
.
init_model_path
,
share_rnn_weights
=
args
.
share_
rnn_
weights
)
share_rnn_weights
=
args
.
share_weights
)
ds2_model
.
train
(
train_batch_reader
=
train_batch_reader
,
dev_batch_reader
=
dev_batch_reader
,
feeding_dict
=
train_generator
.
feeding
,
learning_rate
=
args
.
adam_
learning_rate
,
learning_rate
=
args
.
learning_rate
,
gradient_clipping
=
400
,
num_passes
=
args
.
num_passes
,
num_iterations_print
=
args
.
num_iter
ations
_print
,
num_iterations_print
=
args
.
num_iter_print
,
output_model_dir
=
args
.
output_model_dir
,
is_local
=
args
.
is_local
)
def print_arguments(args):
    """Print all parsed command-line arguments, one per line, sorted by name.

    :param args: Parsed arguments to display.
    :type args: argparse.Namespace
    """
    print("----------- Configuration Arguments -----------")
    # ``vars(args).items()`` works on both Python 2 and 3;
    # the previous ``iteritems()`` is Python-2-only and raises
    # AttributeError under Python 3.
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")
def
main
():
utils
.
print_arguments
(
args
)
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
args
.
trainer_count
)
train
()
...
...
deep_speech_2/tune.py
浏览文件 @
274899d3
"""
P
arameters tuning for DeepSpeech2 model."""
"""
Beam search p
arameters tuning for DeepSpeech2 model."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
...
...
@@ -11,134 +11,71 @@ import paddle.v2 as paddle
from
data_utils.data
import
DataGenerator
from
model
import
DeepSpeech2Model
from
error_rate
import
wer
import
utils
NUM_CPU
=
multiprocessing
.
cpu_count
()
//
2
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
)
parser
.
add_argument
(
"--num_samples"
,
default
=
100
,
type
=
int
,
help
=
"Number of samples for parameters tuning. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_conv_layers"
,
default
=
2
,
type
=
int
,
help
=
"Convolution layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_rnn_layers"
,
default
=
3
,
type
=
int
,
help
=
"RNN layer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--rnn_layer_size"
,
default
=
2048
,
type
=
int
,
help
=
"RNN layer cell number. (default: %(default)s)"
)
parser
.
add_argument
(
"--share_rnn_weights"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--use_gru"
,
default
=
False
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use GRU or simple RNN. (default: %(default)s)"
)
parser
.
add_argument
(
"--use_gpu"
,
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--trainer_count"
,
default
=
8
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
1
,
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_processes_beam_search"
,
default
=
multiprocessing
.
cpu_count
()
//
2
,
type
=
int
,
help
=
"Number of cpu processes for beam search. (default: %(default)s)"
)
parser
.
add_argument
(
"--specgram_type"
,
default
=
'linear'
,
type
=
str
,
help
=
"Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
type
=
str
,
help
=
"Manifest path for normalizer. (default: %(default)s)"
)
parser
.
add_argument
(
"--tune_manifest_path"
,
default
=
'datasets/manifest.dev'
,
type
=
str
,
help
=
"Manifest path for tuning. (default: %(default)s)"
)
parser
.
add_argument
(
"--model_filepath"
,
default
=
'checkpoints/params.latest.tar.gz'
,
type
=
str
,
help
=
"Model filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--vocab_filepath"
,
default
=
'datasets/vocab/eng_vocab.txt'
,
type
=
str
,
help
=
"Vocabulary filepath. (default: %(default)s)"
)
parser
.
add_argument
(
"--beam_size"
,
default
=
500
,
type
=
int
,
help
=
"Width for beam search decoding. (default: %(default)d)"
)
parser
.
add_argument
(
"--language_model_path"
,
default
=
"lm/data/common_crawl_00.prune01111.trie.klm"
,
type
=
str
,
help
=
"Path for language model. (default: %(default)s)"
)
parser
.
add_argument
(
"--alpha_from"
,
default
=
0.1
,
type
=
float
,
help
=
"Where alpha starts from. (default: %(default)f)"
)
parser
.
add_argument
(
"--num_alphas"
,
default
=
14
,
type
=
int
,
help
=
"Number of candidate alphas. (default: %(default)d)"
)
parser
.
add_argument
(
"--alpha_to"
,
default
=
0.36
,
type
=
float
,
help
=
"Where alpha ends with. (default: %(default)f)"
)
parser
.
add_argument
(
"--beta_from"
,
default
=
0.05
,
type
=
float
,
help
=
"Where beta starts from. (default: %(default)f)"
)
parser
.
add_argument
(
"--num_betas"
,
default
=
20
,
type
=
float
,
help
=
"Number of candidate betas. (default: %(default)d)"
)
parser
.
add_argument
(
"--beta_to"
,
default
=
1.0
,
type
=
float
,
help
=
"Where beta ends with. (default: %(default)f)"
)
parser
.
add_argument
(
"--cutoff_prob"
,
default
=
0.99
,
type
=
float
,
help
=
"The cutoff probability of pruning"
"in beam search. (default: %(default)f)"
)
def add_arg(argname, type, default, help, **kwargs):
    """Register one command-line argument on the module-level parser.

    :param argname: Argument name without the leading "--".
    :param type: Expected Python type of the argument. ``bool`` is mapped
                 to ``distutils.util.strtobool`` so that strings such as
                 "true"/"false" on the command line parse correctly
                 (plain ``bool("false")`` would be truthy).
    :param default: Default value; echoed into the help text.
    :param help: Human-readable description. A " Default: ..." suffix is
                 appended automatically.
    :param kwargs: Extra keyword arguments forwarded verbatim to
                   ``parser.add_argument`` (e.g. ``choices``).
    """
    # NOTE: ``type`` and ``help`` intentionally shadow builtins to mirror
    # argparse's own keyword names; kept for caller compatibility.
    # ``is`` (identity) is the correct comparison for the ``bool`` type
    # singleton, rather than ``==``.
    type = distutils.util.strtobool if type is bool else type
    parser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)
# yapf: disable
# configurations of overall
add_arg
(
'num_samples'
,
int
,
100
,
"# of samples to infer."
)
add_arg
(
'trainer_count'
,
int
,
8
,
"# of Trainers (CPUs or GPUs)."
)
add_arg
(
'use_gpu'
,
bool
,
True
,
"Use GPU or not."
)
add_arg
(
'error_rate_type'
,
str
,
'wer'
,
"Error rate type for evaluation."
,
choices
=
[
'wer'
,
'cer'
])
# configurations of tuning parameters
add_arg
(
'alpha_from'
,
float
,
0.1
,
"Where alpha starts tuning from."
)
add_arg
(
'alpha_to'
,
float
,
0.36
,
"Where alpha ends tuning with."
)
add_arg
(
'num_alphas'
,
int
,
14
,
"# of alpha candidates for tuning."
)
add_arg
(
'beta_from'
,
float
,
0.05
,
"Where beta starts tuning from."
)
add_arg
(
'beta_to'
,
float
,
0.36
,
"Where beta ends tuning with."
)
add_arg
(
'num_betas'
,
int
,
20
,
"# of beta candidates for tuning."
)
# configurations of decoder
add_arg
(
'beam_size'
,
int
,
500
,
"Beam search width."
)
add_arg
(
'cutoff_prob'
,
float
,
0.99
,
"Cutoff probability for pruning."
)
add_arg
(
'parallels_bsearch'
,
int
,
NUM_CPU
,
"# of CPUs for beam search."
)
add_arg
(
'lang_model_path'
,
str
,
'lm/data/common_crawl_00.prune01111.trie.klm'
,
"Filepath for language model."
)
# configurations of data preprocess
add_arg
(
'specgram_type'
,
str
,
'linear'
,
"Audio feature type. Options: linear, mfcc."
,
choices
=
[
'linear'
,
'mfcc'
])
# configurations of model structure
add_arg
(
'num_conv_layers'
,
int
,
2
,
"# of convolution layers."
)
add_arg
(
'num_rnn_layers'
,
int
,
3
,
"# of recurrent layers."
)
add_arg
(
'rnn_layer_size'
,
int
,
2048
,
"# of recurrent cells per layer."
)
add_arg
(
'use_gru'
,
bool
,
False
,
"Use GRUs instead of Simple RNNs."
)
add_arg
(
'share_rnn_weights'
,
bool
,
True
,
"Share input-hidden weights across "
"bi-directional RNNs. Not for GRU."
)
# configurations of data io
add_arg
(
'tune_manifest'
,
str
,
'datasets/manifest.test'
,
"Filepath of manifest to tune."
)
add_arg
(
'mean_std_path'
,
str
,
'mean_std.npz'
,
"Filepath of normalizer's mean & std."
)
add_arg
(
'vocab_path'
,
str
,
'datasets/vocab/eng_vocab.txt'
,
"Filepath of vocabulary."
)
# configurations of model io
add_arg
(
'model_path'
,
str
,
'./checkpoints/params.latest.tar.gz'
,
"If None, the training starts from scratch, "
"otherwise, it resumes from the pre-trained model."
)
args
=
parser
.
parse_args
()
# yapf: disable
def
tune
():
...
...
@@ -149,13 +86,13 @@ def tune():
raise
ValueError
(
"num_betas must be non-negative!"
)
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_
file
path
,
mean_std_filepath
=
args
.
mean_std_
file
path
,
vocab_filepath
=
args
.
vocab_path
,
mean_std_filepath
=
args
.
mean_std_path
,
augmentation_config
=
'{}'
,
specgram_type
=
args
.
specgram_type
,
num_threads
=
args
.
num_threads_data
)
num_threads
=
1
)
batch_reader
=
data_generator
.
batch_reader_creator
(
manifest_path
=
args
.
tune_manifest
_path
,
manifest_path
=
args
.
tune_manifest
,
batch_size
=
args
.
num_samples
,
sortagrad
=
False
,
shuffle_method
=
None
)
...
...
@@ -171,7 +108,7 @@ def tune():
num_rnn_layers
=
args
.
num_rnn_layers
,
rnn_layer_size
=
args
.
rnn_layer_size
,
use_gru
=
args
.
use_gru
,
pretrained_model_path
=
args
.
model_
file
path
,
pretrained_model_path
=
args
.
model_path
,
share_rnn_weights
=
args
.
share_rnn_weights
)
# create grid for search
...
...
@@ -184,14 +121,14 @@ def tune():
for
alpha
,
beta
in
params_grid
:
result_transcripts
=
ds2_model
.
infer_batch
(
infer_data
=
tune_data
,
decode
_method
=
'
beam_search'
,
decode
r_method
=
'ctc_
beam_search'
,
beam_alpha
=
alpha
,
beam_beta
=
beta
,
beam_size
=
args
.
beam_size
,
cutoff_prob
=
args
.
cutoff_prob
,
vocab_list
=
data_generator
.
vocab_list
,
language_model_path
=
args
.
lang
uage
_model_path
,
num_processes
=
args
.
num_processes_beam_
search
)
language_model_path
=
args
.
lang_model_path
,
num_processes
=
args
.
parallels_b
search
)
wer_sum
,
num_ins
=
0.0
,
0
for
target
,
result
in
zip
(
target_transcripts
,
result_transcripts
):
wer_sum
+=
wer
(
target
,
result
)
...
...
@@ -200,8 +137,15 @@ def tune():
(
alpha
,
beta
,
wer_sum
/
num_ins
))
def print_arguments(args):
    """Print all parsed command-line arguments, one per line, sorted by name.

    :param args: Parsed arguments to display.
    :type args: argparse.Namespace
    """
    print("----------- Configuration Arguments -----------")
    # ``vars(args).items()`` works on both Python 2 and 3;
    # the previous ``iteritems()`` is Python-2-only and raises
    # AttributeError under Python 3.
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("------------------------------------------------")
def
main
():
utils
.
print_arguments
(
args
)
print_arguments
(
args
)
paddle
.
init
(
use_gpu
=
args
.
use_gpu
,
trainer_count
=
args
.
trainer_count
)
tune
()
...
...
deep_speech_2/utils.py
已删除
100644 → 0
浏览文件 @
53e59ee7
"""Contains common utility functions."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
def print_arguments(args):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="Jonh", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("----- Configuration Arguments -----")
    # ``vars(args).items()`` works on both Python 2 and 3; the previous
    # ``iteritems()`` is Python-2-only and raises AttributeError under
    # Python 3. Iteration order is left as-is (unsorted) to preserve the
    # original behavior of this helper.
    for arg, value in vars(args).items():
        print("%s: %s" % (arg, value))
    print("------------------------------------")
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录