Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
85d50214
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
85d50214
编写于
8月 08, 2021
作者:
H
huangyuxin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
reconstruct the exp/model.py and the model.export()
上级
31922865
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
43 addition
and
73 deletion
+43
-73
deepspeech/exps/deepspeech2/config.py
deepspeech/exps/deepspeech2/config.py
+1
-1
deepspeech/exps/deepspeech2/model.py
deepspeech/exps/deepspeech2/model.py
+21
-6
deepspeech/models/ds2_online/deepspeech2.py
deepspeech/models/ds2_online/deepspeech2.py
+21
-66
未找到文件。
deepspeech/exps/deepspeech2/config.py
浏览文件 @
85d50214
...
...
@@ -27,7 +27,7 @@ def get_cfg_defaults(model_type='offline'):
_C
.
collator
=
SpeechCollator
.
params
()
_C
.
training
=
DeepSpeech2Trainer
.
params
()
_C
.
decoding
=
DeepSpeech2Tester
.
params
()
if
(
model_type
==
'offline'
)
:
if
model_type
==
'offline'
:
_C
.
model
=
DeepSpeech2Model
.
params
()
else
:
_C
.
model
=
DeepSpeech2ModelOnline
.
params
()
...
...
deepspeech/exps/deepspeech2/model.py
浏览文件 @
85d50214
...
...
@@ -124,10 +124,23 @@ class DeepSpeech2Trainer(Trainer):
def
setup_model
(
self
):
config
=
self
.
config
if
hasattr
(
self
,
"train_loader"
):
config
.
defrost
()
config
.
model
.
feat_size
=
self
.
train_loader
.
collate_fn
.
feature_size
config
.
model
.
dict_size
=
self
.
train_loader
.
collate_fn
.
vocab_size
config
.
freeze
()
elif
hasattr
(
self
,
"test_loader"
):
config
.
defrost
()
config
.
model
.
feat_size
=
self
.
test_loader
.
collate_fn
.
feature_size
config
.
model
.
dict_size
=
self
.
test_loader
.
collate_fn
.
vocab_size
config
.
freeze
()
else
:
raise
Exception
(
"Please setup the dataloader first"
)
if
self
.
args
.
model_type
==
'offline'
:
model
=
DeepSpeech2Model
(
feat_size
=
self
.
train_loader
.
collate_fn
.
feature
_size
,
dict_size
=
self
.
train_loader
.
collate_fn
.
vocab
_size
,
feat_size
=
config
.
model
.
feat
_size
,
dict_size
=
config
.
model
.
dict
_size
,
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
...
...
@@ -135,8 +148,8 @@ class DeepSpeech2Trainer(Trainer):
share_rnn_weights
=
config
.
model
.
share_rnn_weights
)
elif
self
.
args
.
model_type
==
'online'
:
model
=
DeepSpeech2ModelOnline
(
feat_size
=
self
.
train_loader
.
collate_fn
.
feature
_size
,
dict_size
=
self
.
train_loader
.
collate_fn
.
vocab
_size
,
feat_size
=
config
.
model
.
feat
_size
,
dict_size
=
config
.
model
.
dict
_size
,
num_conv_layers
=
config
.
model
.
num_conv_layers
,
num_rnn_layers
=
config
.
model
.
num_rnn_layers
,
rnn_size
=
config
.
model
.
rnn_layer_size
,
...
...
@@ -209,6 +222,7 @@ class DeepSpeech2Trainer(Trainer):
batch_sampler
=
batch_sampler
,
collate_fn
=
collate_fn_train
,
num_workers
=
config
.
collator
.
num_workers
)
print
(
"feature_size"
,
self
.
train_loader
.
collate_fn
.
feature_size
)
self
.
valid_loader
=
DataLoader
(
dev_dataset
,
batch_size
=
config
.
collator
.
batch_size
,
...
...
@@ -368,8 +382,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
dtype
=
'int64'
),
# audio_length, [B]
])
elif
self
.
args
.
model_type
==
'online'
:
static_model
=
DeepSpeech2InferModelOnline
.
export
(
infer_model
,
feat_dim
)
static_model
=
infer_model
.
export
()
else
:
raise
Exception
(
"wrong model type"
)
logger
.
info
(
f
"Export code:
{
static_model
.
forward
.
code
}
"
)
...
...
@@ -395,6 +408,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self
.
iteration
=
0
self
.
epoch
=
0
'''
def setup_model(self):
config = self.config
if self.args.model_type == 'offline':
...
...
@@ -422,6 +436,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
self.model = model
logger.info("Setup model!")
'''
def
setup_dataloader
(
self
):
config
=
self
.
config
.
clone
()
...
...
deepspeech/models/ds2_online/deepspeech2.py
浏览文件 @
85d50214
...
...
@@ -88,55 +88,7 @@ class CRNNEncoder(nn.Layer):
def
output_size
(
self
):
return
self
.
fc_layers_size_list
[
-
1
]
def
forward
(
self
,
x
,
x_lens
):
"""Compute Encoder outputs
Args:
x (Tensor): [B, T_input, D]
x_lens (Tensor): [B]
Returns:
x (Tensor): encoder outputs, [B, T_output, D]
x_lens (Tensor): encoder length, [B]
final_state_h_box(Tensor): final_states h for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size
final_state_c_box(Tensor): final_states c for RNN layers, num_rnn_layers * num_directions, batch_size, hidden_size
"""
# [B, T, D]
# convolution group
x
,
x_lens
=
self
.
conv
(
x
,
x_lens
)
# convert data from convolution feature map to sequence of vectors
#B, C, D, T = paddle.shape(x) # not work under jit
#x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
#x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
#x = x.reshape([0, 0, -1]) #[B, T, C*D]
# remove padding part
init_state
=
None
final_state_list
=
[]
for
i
in
range
(
0
,
self
.
num_rnn_layers
):
x
,
final_state
=
self
.
rnn
[
i
](
x
,
init_state
,
x_lens
)
#[B, T, D]
final_state_list
.
append
(
final_state
)
x
=
self
.
layernorm_list
[
i
](
x
)
for
i
in
range
(
self
.
num_fc_layers
):
x
=
self
.
fc_layers_list
[
i
](
x
)
x
=
F
.
relu
(
x
)
if
self
.
use_gru
==
True
:
final_state_h_box
=
paddle
.
concat
(
final_state_list
,
axis
=
0
)
final_state_c_box
=
paddle
.
zeros_like
(
final_state_h_box
)
else
:
final_state_h_list
=
[
final_state_list
[
i
][
0
]
for
i
in
range
(
self
.
num_rnn_layers
)
]
final_state_c_list
=
[
final_state_list
[
i
][
1
]
for
i
in
range
(
self
.
num_rnn_layers
)
]
final_state_h_box
=
paddle
.
concat
(
final_state_h_list
,
axis
=
0
)
final_state_c_box
=
paddle
.
concat
(
final_state_c_list
,
axis
=
0
)
return
x
,
x_lens
,
final_state_h_box
,
final_state_c_box
def
forward_chunk
(
self
,
x
,
x_lens
,
init_state_h_box
,
init_state_c_box
):
def
forward
(
self
,
x
,
x_lens
,
init_state_h_box
=
None
,
init_state_c_box
=
None
):
"""Compute Encoder outputs
Args:
...
...
@@ -152,13 +104,16 @@ class CRNNEncoder(nn.Layer):
"""
if
init_state_h_box
is
not
None
:
init_state_list
=
None
init_state_h_list
=
paddle
.
split
(
init_state_h_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_c_list
=
paddle
.
split
(
init_state_c_box
,
self
.
num_rnn_layers
,
axis
=
0
)
if
self
.
use_gru
==
True
:
init_state_h_list
=
paddle
.
split
(
init_state_h_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_list
=
init_state_h_list
else
:
init_state_h_list
=
paddle
.
split
(
init_state_h_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_c_list
=
paddle
.
split
(
init_state_c_box
,
self
.
num_rnn_layers
,
axis
=
0
)
init_state_list
=
[(
init_state_h_list
[
i
],
init_state_c_list
[
i
])
for
i
in
range
(
self
.
num_rnn_layers
)]
else
:
...
...
@@ -179,7 +134,7 @@ class CRNNEncoder(nn.Layer):
if
self
.
use_gru
==
True
:
final_chunk_state_h_box
=
paddle
.
concat
(
final_chunk_state_list
,
axis
=
0
)
final_chunk_state_c_box
=
paddle
.
zeros_like
(
final_chunk_state_h_box
)
final_chunk_state_c_box
=
init_state_c_box
#
paddle.zeros_like(final_chunk_state_h_box)
else
:
final_chunk_state_h_list
=
[
final_chunk_state_list
[
i
][
0
]
for
i
in
range
(
self
.
num_rnn_layers
)
...
...
@@ -242,13 +197,13 @@ class CRNNEncoder(nn.Layer):
x_chunk_lens
=
paddle
.
where
(
x_len_left
<
x_chunk_len_tmp
,
x_len_left
,
x_chunk_len_tmp
)
eouts_chunk
,
eouts_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
=
self
.
forward
_chunk
(
eouts_chunk
,
eouts_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
=
self
.
forward
(
x_chunk
,
x_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
)
eouts_chunk_list
.
append
(
eouts_chunk
)
eouts_chunk_lens_list
.
append
(
eouts_chunk_lens
)
final_state_h_box
=
chunk_state_h_box
final_state_c_box
=
chunk_state_c_box
final_state_h_box
=
chunk_state_h_box
final_state_c_box
=
chunk_state_c_box
return
eouts_chunk_list
,
eouts_chunk_lens_list
,
final_state_h_box
,
final_state_c_box
...
...
@@ -297,7 +252,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
feat_size
,
dict_size
,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
num_rnn_layers
=
4
,
rnn_size
=
1024
,
rnn_direction
=
'forward'
,
num_fc_layers
=
2
,
...
...
@@ -337,7 +292,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
loss (Tenosr): [1]
"""
eouts
,
eouts_len
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
(
audio
,
audio_len
)
audio
,
audio_len
,
None
,
None
)
loss
=
self
.
decoder
(
eouts
,
eouts_len
,
text
,
text_len
)
return
loss
...
...
@@ -355,7 +310,7 @@ class DeepSpeech2ModelOnline(nn.Layer):
decoding_method
=
decoding_method
)
eouts
,
eouts_len
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
(
audio
,
audio_len
)
audio
,
audio_len
,
None
,
None
)
probs
=
self
.
decoder
.
softmax
(
eouts
)
return
self
.
decoder
.
decode_probs
(
probs
.
numpy
(),
eouts_len
,
vocab_list
,
decoding_method
,
...
...
@@ -401,7 +356,7 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
feat_size
,
dict_size
,
num_conv_layers
=
2
,
num_rnn_layers
=
3
,
num_rnn_layers
=
4
,
rnn_size
=
1024
,
rnn_direction
=
'forward'
,
num_fc_layers
=
2
,
...
...
@@ -420,18 +375,18 @@ class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
def
forward
(
self
,
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
):
eouts_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
.
forward_chunk
(
eouts_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
(
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
)
probs_chunk
=
self
.
decoder
.
softmax
(
eouts_chunk
)
return
probs_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
@
classmethod
def
export
(
self
,
infer_model
,
feat_dim
):
def
export
(
self
):
static_model
=
paddle
.
jit
.
to_static
(
infer_model
,
self
,
input_spec
=
[
paddle
.
static
.
InputSpec
(
shape
=
[
None
,
None
,
feat_dim
],
#[B, chunk_size, feat_dim]
shape
=
[
None
,
None
,
self
.
encoder
.
feat_size
],
#[B, chunk_size, feat_dim]
dtype
=
'float32'
),
# audio, [B,T,D]
paddle
.
static
.
InputSpec
(
shape
=
[
None
],
dtype
=
'int64'
),
# audio_length, [B]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录