PaddlePaddle / DeepSpeech
Commit e4ef8ed3
Authored on Jul 29, 2021 by huangyuxin
Parent: 87163864

    add the subsampling as conv

Showing 9 changed files with 70 additions and 195 deletions (+70, -195)
deepspeech/models/ds2/__init__.py                 +14  -1
deepspeech/models/ds2/deepspeech2.py              +1   -1
deepspeech/models/ds2_online/__init__.py          +14  -4
deepspeech/models/ds2_online/conv.py              +17  -152
deepspeech/models/ds2_online/deepspeech2.py       +21  -31
examples/aishell/s0/conf/deepspeech2.yaml         +1   -2
examples/librispeech/s0/conf/deepspeech2.yaml     +0   -1
examples/tiny/s0/conf/deepspeech2.yaml            +0   -1
tests/deepspeech2_model_test.py                   +2   -2
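The substantive change is in the online model's convolution frontend; the other files are import, config, and license-header updates that follow from it. The core substitution, condensed here from the deepspeech/models/ds2_online/deepspeech2.py hunk shown further down (a summary, not extra code in the commit):

# Before: hand-rolled ConvStack, sized through its output_height
#   self.conv = ConvStack(feat_size, num_conv_layers)
#   i_size = self.conv.output_height  # H after conv stack
# After: shared Conv2dSubsampling4 wrapper, sized through its flattened output_dim
#   self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
#   i_size = self.conv.output_dim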
deepspeech/models/ds2/__init__.py

-from .deepspeech2 import DeepSpeech2Model
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model

 __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
deepspeech/models/ds2/deepspeech2.py

@@ -19,8 +19,8 @@ from paddle import nn
 from yacs.config import CfgNode

 from deepspeech.models.ds2.conv import ConvStack
-from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.models.ds2.rnn import RNNStack
+from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log
deepspeech/models/ds2_online/__init__.py

-from .deepspeech2 import DeepSpeech2ModelOnline
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .deepspeech2 import DeepSpeech2InferModelOnline
 from .deepspeech2 import DeepSpeech2ModelOnline

 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
deepspeech/models/ds2_online/conv.py

@@ -11,162 +11,27 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import paddle
 from paddle import nn
-from paddle.nn import functional as F
-from deepspeech.modules.activation import brelu
-from deepspeech.modules.mask import make_non_pad_mask
 from deepspeech.utils.log import Log
+from deepspeech.modules.embedding import PositionalEncoding
+from deepspeech.modules.subsampling import Conv2dSubsampling4

 logger = Log(__name__).getlog()

-__all__ = ['ConvStack', "conv_output_size"]

+class Conv2dSubsampling4Online(Conv2dSubsampling4):
+    def __init__(self,
+                 idim: int,
+                 odim: int,
+                 dropout_rate: float,
+                 pos_enc_class: nn.Layer=PositionalEncoding):
+        super().__init__(idim, odim, dropout_rate, pos_enc_class)
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim

-def conv_output_size(I, F, P, S):
-    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-    # Output size after Conv:
-    # By noting I the length of the input volume size,
-    # F the length of the filter,
-    # P the amount of zero padding,
-    # S the stride,
-    # then the output size O of the feature map along that dimension is given by:
-    #     O = (I - F + Pstart + Pend) // S + 1
-    # When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
-    # When Pstart == Pend == 0
-    #     O = (I - F - S) // S
-    # https://iq.opengenus.org/output-size-of-convolution/
-    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
-    # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
-    return (I - F + 2 * P - S) // S

-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl)

-class ConvBn(nn.Layer):
-    """Convolution layer with batch normalization.
-
-    :param kernel_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type kernel_size: int|tuple|list
-    :param num_channels_in: Number of input channels.
-    :type num_channels_in: int
-    :param num_channels_out: Number of output channels.
-    :type num_channels_out: int
-    :param stride: The x dimension of the stride. Or input a tuple for two
-                   image dimension.
-    :type stride: int|tuple|list
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension.
-    :type padding: int|tuple|list
-    :param act: Activation type, relu|brelu
-    :type act: string
-
-    :return: Batch norm layer after convolution layer.
-    :rtype: Variable
-    """
-
-    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
-                 padding, act):
-        super().__init__()
-        assert len(kernel_size) == 2
-        assert len(stride) == 2
-        assert len(padding) == 2
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-
-        self.conv = nn.Conv2D(
-            num_channels_in,
-            num_channels_out,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            weight_attr=None,
-            bias_attr=False,
-            data_format='NCHW')
-
-        self.bn = nn.BatchNorm2D(
-            num_channels_out,
-            weight_attr=None,
-            bias_attr=None,
-            data_format='NCHW')
-        self.act = F.relu if act == 'relu' else brelu
-
-    def forward(self, x, x_len):
-        """
-        x(Tensor): audio, shape [B, C, D, T]
-        """
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
         x = self.conv(x)
-        x = self.bn(x)
-        x = self.act(x)
-
-        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
-                 ) // self.stride[1] + 1
-
-        # reset padding part to 0
-        masks = make_non_pad_mask(x_len)  #[B, T]
-        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
-        # TODO(Hui Zhang): not support bool multiply
-        # masks = masks.type_as(x)
-        masks = masks.astype(x.dtype)
-        x = x.multiply(masks)
-        return x, x_len

-class ConvStack(nn.Layer):
-    """Convolution group with stacked convolution layers.
-
-    :param feat_size: audio feature dim.
-    :type feat_size: int
-    :param num_stacks: Number of stacked convolution layers.
-    :type num_stacks: int
-    """
-
-    def __init__(self, feat_size, num_stacks):
-        super().__init__()
-        self.feat_size = feat_size  # D
-        self.num_stacks = num_stacks
-
-        self.conv_in = ConvBn(
-            num_channels_in=1,
-            num_channels_out=32,
-            kernel_size=(41, 11),  #[D, T]
-            stride=(2, 3),
-            padding=(20, 5),
-            act='brelu')
-
-        out_channel = 32
-        convs = [
-            ConvBn(
-                num_channels_in=32,
-                num_channels_out=out_channel,
-                kernel_size=(21, 11),
-                stride=(2, 1),
-                padding=(10, 5),
-                act='brelu') for i in range(num_stacks - 1)
-        ]
-        self.conv_stack = nn.LayerList(convs)
-
-        # conv output feat_dim
-        output_height = (feat_size - 1) // 2 + 1
-        for i in range(self.num_stacks - 1):
-            output_height = (output_height - 1) // 2 + 1
-        self.output_height = out_channel * output_height
-
-    def forward(self, x, x_len):
-        """
-        x: shape [B, C, D, T]
-        x_len : shape [B]
-        """
-        x, x_len = self.conv_in(x, x_len)
-        for i, conv in enumerate(self.conv_stack):
-            x, x_len = conv(x, x_len)
+        b, c, t, f = paddle.shape(x)
+        x = x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])
+        x_len = ((x_len - 1) // 2 - 1) // 2
         return x, x_len
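For orientation: Conv2dSubsampling4Online defers to Conv2dSubsampling4 (two stride-2 convolutions) and only adds the flattened output_dim plus the output-length bookkeeping seen above. A minimal sketch of that arithmetic, with illustrative numbers (80-dim features, odim=32, 100 input frames) that are assumptions rather than values from this commit:

# Illustrative only: idim, odim and T are assumed values, not taken from the commit.
idim, odim, T = 80, 32, 100

# Two stride-2 convolutions shrink the feature and time axes by roughly 4x each.
feat_after = ((idim - 1) // 2 - 1) // 2   # 19 feature bins remain
output_dim = feat_after * odim            # 19 * 32 = 608, the RNN input size
t_after = ((T - 1) // 2 - 1) // 2         # 24 frames remain on the time axis

print(output_dim, t_after)                # 608 24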
deepspeech/models/ds2_online/deepspeech2.py

@@ -11,27 +11,19 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Deepspeech2 ASR Model"""
+"""Deepspeech2 ASR Online Model"""
 from typing import Optional

 import paddle
 import paddle.nn.functional as F
 from paddle import nn
-from paddle.fluid.layers import fc
-from paddle.nn import GRU
-from paddle.nn import LayerList
-from paddle.nn import LayerNorm
-from paddle.nn import Linear
-from paddle.nn import LSTM
 from yacs.config import CfgNode

-from deepspeech.models.ds2_online.conv import ConvStack
-from deepspeech.models.ds2_online.rnn import RNNStack
+from deepspeech.models.ds2_online.conv import Conv2dSubsampling4Online
 from deepspeech.modules.ctc import CTCDecoder
 from deepspeech.utils import layer_tools
 from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

 __all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModeOnline']

@@ -55,46 +47,48 @@ class CRNNEncoder(nn.Layer):
         self.num_rnn_layers = num_rnn_layers
         self.num_fc_layers = num_fc_layers
         self.fc_layers_size_list = fc_layers_size_list
-        self.conv = ConvStack(feat_size, num_conv_layers)
+        self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)

-        i_size = self.conv.output_height  # H after conv stack
+        i_size = self.conv.output_dim

-        self.rnn = LayerList()
-        self.layernorm_list = LayerList()
-        self.fc_layers_list = LayerList()
+        self.rnn = nn.LayerList()
+        self.layernorm_list = nn.LayerList()
+        self.fc_layers_list = nn.LayerList()
         rnn_direction = 'forward'
         layernorm_size = rnn_size

         if use_gru == True:
             self.rnn.append(
-                GRU(input_size=i_size,
+                nn.GRU(input_size=i_size,
                     hidden_size=rnn_size,
                     num_layers=1,
                     direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
             for i in range(1, num_rnn_layers):
                 self.rnn.append(
-                    GRU(input_size=layernorm_size,
+                    nn.GRU(input_size=layernorm_size,
                         hidden_size=rnn_size,
                         num_layers=1,
                         direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))
         else:
             self.rnn.append(
-                LSTM(
+                nn.LSTM(
                     input_size=i_size,
                     hidden_size=rnn_size,
                     num_layers=1,
                     direction=rnn_direction))
-            self.layernorm_list.append(LayerNorm(layernorm_size))
+            self.layernorm_list.append(nn.LayerNorm(layernorm_size))
             for i in range(1, num_rnn_layers):
                 self.rnn.append(
-                    LSTM(
+                    nn.LSTM(
                         input_size=layernorm_size,
                         hidden_size=rnn_size,
                         num_layers=1,
                         direction=rnn_direction))
-                self.layernorm_list.append(LayerNorm(layernorm_size))
+                self.layernorm_list.append(nn.LayerNorm(layernorm_size))

         fc_input_size = layernorm_size
         for i in range(self.num_fc_layers):
             self.fc_layers_list.append(

@@ -117,20 +111,16 @@ class CRNNEncoder(nn.Layer):
             x (Tensor): encoder outputs, [B, T, D]
             x_lens (Tensor): encoder length, [B]
         """
-        # [B, T, D] -> [B, D, T]
-        audio = audio.transpose([0, 2, 1])
-        # [B, D, T] -> [B, C=1, D, T]
-        x = audio.unsqueeze(1)
+        # [B, T, D]
+        x = audio
         x_lens = audio_len

         # convolution group
         x, x_lens = self.conv(x, x_lens)

-        # convert data from convolution feature map to sequence of vectors
-        #B, C, D, T = paddle.shape(x) # not work under jit
-        x = x.transpose([0, 3, 1, 2])  #[B, T, C, D]
+        # x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
         #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
-        x = x.reshape([0, 0, -1])  #[B, T, C*D]
+        # x = x.reshape([0, 0, -1]) #[B, T, C*D]

         # remove padding part
         x, output_state = self.rnn[0](x, None, x_lens)
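The encoder-side edits boil down to two things: the input now stays [B, T, D] all the way into the subsampling conv (no transpose/unsqueeze beforehand, no transpose/reshape afterwards), and the recurrent stack is built from paddle.nn classes (nn.GRU, nn.LSTM, nn.LayerNorm, nn.LayerList) rather than the bare names previously imported at module level. A rough, self-contained sketch of that per-layer pattern, with assumed sizes and without the fc layers, masking, or state handling of the real CRNNEncoder:

import paddle
from paddle import nn

# Assumed sizes for illustration; the real values come from the model config.
i_size, rnn_size, num_rnn_layers = 608, 1024, 3
use_gru, rnn_direction = True, 'forward'

rnn = nn.LayerList()
layernorm_list = nn.LayerList()
for i in range(num_rnn_layers):
    input_size = i_size if i == 0 else rnn_size
    cell = nn.GRU if use_gru else nn.LSTM
    rnn.append(cell(input_size=input_size, hidden_size=rnn_size,
                    num_layers=1, direction=rnn_direction))
    layernorm_list.append(nn.LayerNorm(rnn_size))

x = paddle.randn([4, 24, i_size])   # [B, T, C*D] as produced by the conv frontend
for layer, norm in zip(rnn, layernorm_list):
    x, _ = layer(x)                 # each layer returns (outputs, final_states)
    x = norm(x)
print(x.shape)                      # [4, 24, 1024]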
examples/aishell/s0/conf/deepspeech2.yaml

@@ -36,11 +36,10 @@ collator:
 model:
   num_conv_layers: 2
-  num_rnn_layers: 4
+  num_rnn_layers: 3
   rnn_layer_size: 1024
   use_gru: True
   share_rnn_weights: False
-  apply_online: False

 training:
   n_epoch: 50
examples/librispeech/s0/conf/deepspeech2.yaml

@@ -40,7 +40,6 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
-  apply_online: False

 training:
   n_epoch: 50
examples/tiny/s0/conf/deepspeech2.yaml

@@ -41,7 +41,6 @@ model:
   rnn_layer_size: 2048
   use_gru: False
   share_rnn_weights: True
-  apply_online: True

 training:
   n_epoch: 10
tests/deepspeech2_model_test.py

@@ -16,8 +16,8 @@ import unittest
 import numpy as np
 import paddle

-#from deepspeech.models.deepspeech2 import DeepSpeech2Model
-from deepspeech.models.ds2_online import DeepSpeech2ModelOnline as DeepSpeech2Model
+from deepspeech.models.ds2 import DeepSpeech2Model


 class TestDeepSpeech2Model(unittest.TestCase):
     def setUp(self):