PaddlePaddle / DeepSpeech
Commit fcd91c62, authored Mar 30, 2021 by Hui Zhang
Parent commit: 63d78c88

    add decoder

Showing 4 changed files with 333 additions and 2 deletions (+333, -2)
deepspeech/__init__.py               +8   -0
deepspeech/modules/decoder.py        +178 -0
deepspeech/modules/decoder_layer.py  +146 -0
deepspeech/modules/mask.py           +1   -2
deepspeech/__init__.py
...
@@ -49,10 +49,18 @@ if not hasattr(paddle, 'softmax'):
    logger.warn("register user softmax to paddle, remove this when fixed!")
    setattr(paddle, 'softmax', paddle.nn.functional.softmax)
if not hasattr(paddle, 'log_softmax'):
    logger.warn("register user log_softmax to paddle, remove this when fixed!")
    setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
if not hasattr(paddle, 'sigmoid'):
    logger.warn("register user sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
if not hasattr(paddle, 'log_sigmoid'):
    logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
    setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
if not hasattr(paddle, 'relu'):
    logger.warn("register user relu to paddle, remove this when fixed!")
    setattr(paddle, 'relu', paddle.nn.functional.relu)
...
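The hunk above back-fills functional ops onto the top-level paddle namespace when an older Paddle build does not expose them. A minimal stand-alone sketch of the same guard-and-setattr pattern (the tensor values are only illustrative):

    import paddle
    import paddle.nn.functional as F

    # Same pattern as the hunk above: alias the functional op onto the paddle
    # namespace only if this Paddle build does not already provide it.
    if not hasattr(paddle, 'log_softmax'):
        setattr(paddle, 'log_softmax', F.log_softmax)

    x = paddle.to_tensor([[1.0, 2.0, 3.0]])
    print(paddle.log_softmax(x, axis=-1))   # same result as F.log_softmax(x, axis=-1)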
deepspeech/modules/decoder.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder definition."""
from typing import Tuple, List, Optional

from typeguard import check_argument_types
import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

from deepspeech.modules.attention import MultiHeadedAttention
from deepspeech.modules.decoder_layer import DecoderLayer
from deepspeech.modules.embedding import PositionalEncoding
from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward
from deepspeech.modules.mask import subsequent_mask
from deepspeech.modules.mask import make_pad_mask

logger = logging.getLogger(__name__)

__all__ = ["TransformerDecoder"]


class TransformerDecoder(nn.Module):
    """Base class of Transformer decoder module.
    Args:
        vocab_size: output dim
        encoder_output_size: dimension of attention
        attention_heads: the number of heads of multi head attention
        linear_units: the number of hidden units of the position-wise feed-forward layer
        num_blocks: the number of decoder blocks
        dropout_rate: dropout rate
        self_attention_dropout_rate: dropout rate for attention
        input_layer: input layer type, `embed`
        use_output_layer: whether to use output layer
        pos_enc_class: PositionalEncoding module
        normalize_before:
            True: use layer_norm before each sub-block of a layer.
            False: use layer_norm after each sub-block of a layer.
        concat_after: whether to concat attention layer's input and output
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            vocab_size: int,
            encoder_output_size: int,
            attention_heads: int=4,
            linear_units: int=2048,
            num_blocks: int=6,
            dropout_rate: float=0.1,
            positional_dropout_rate: float=0.1,
            self_attention_dropout_rate: float=0.0,
            src_attention_dropout_rate: float=0.0,
            input_layer: str="embed",
            use_output_layer: bool=True,
            normalize_before: bool=True,
            concat_after: bool=False, ):
        assert check_argument_types()
        super().__init__()
        attention_dim = encoder_output_size

        if input_layer == "embed":
            self.embed = nn.Sequential(
                nn.Embedding(vocab_size, attention_dim),
                PositionalEncoding(attention_dim, positional_dropout_rate), )
        else:
            raise ValueError(f"only 'embed' is supported: {input_layer}")

        self.normalize_before = normalize_before
        self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
        self.use_output_layer = use_output_layer
        self.output_layer = nn.Linear(attention_dim, vocab_size)

        self.decoders = nn.ModuleList([
            DecoderLayer(
                size=attention_dim,
                self_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                               self_attention_dropout_rate),
                src_attn=MultiHeadedAttention(attention_heads, attention_dim,
                                              src_attention_dropout_rate),
                feed_forward=PositionwiseFeedForward(
                    attention_dim, linear_units, dropout_rate),
                dropout_rate=dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after, ) for _ in range(num_blocks)
        ])

    def forward(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            ys_in_pad: paddle.Tensor,
            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Forward decoder.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
            ys_in_lens: input lengths of this batch (batch)
        Returns:
            (tuple): tuple containing:
                x: decoded token score before softmax (batch, maxlen_out, vocab_size)
                    if use_output_layer is True,
                olens: (batch, )
        """
        tgt = ys_in_pad
        # tgt_mask: (B, 1, L)
        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
        # m: (1, L, L)
        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
        # tgt_mask: (B, L, L)
        tgt_mask = tgt_mask & m

        x, _ = self.embed(tgt)
        for layer in self.decoders:
            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
                                                     memory_mask)
        if self.normalize_before:
            x = self.after_norm(x)
        if self.use_output_layer:
            x = self.output_layer(x)

        olens = tgt_mask.sum(1)
        return x, olens

    def forward_one_step(
            self,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            cache: Optional[List[paddle.Tensor]]=None,
    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
        """Forward one step.
            This is only used for decoding.
        Args:
            memory: encoded memory, float32 (batch, maxlen_in, feat)
            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
            tgt: input token ids, int64 (batch, maxlen_out)
            tgt_mask: input token mask, (batch, maxlen_out)
                dtype=paddle.bool
            cache: cached output list of (batch, max_time_out-1, size)
        Returns:
            y, cache: NN output value and cache per `self.decoders`.
                `y.shape` is (batch, maxlen_out, token)
        """
        x, _ = self.embed(tgt)
        new_cache = []
        for i, decoder in enumerate(self.decoders):
            if cache is None:
                c = None
            else:
                c = cache[i]
            x, tgt_mask, memory, memory_mask = decoder(
                x, tgt_mask, memory, memory_mask, cache=c)
            new_cache.append(x)

        if self.normalize_before:
            y = self.after_norm(x[:, -1])
        else:
            y = x[:, -1]

        if self.use_output_layer:
            y = paddle.log_softmax(self.output_layer(y), dim=-1)

        return y, new_cache
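For readers new to this decoder, the target mask built at the top of forward() is the AND of a padding ("non-pad") mask and a causal mask. The following self-contained sketch mirrors, rather than calls, the helpers in deepspeech.modules.mask, and the sizes are made up for illustration; broadcasting of logical_and over the two mask shapes is assumed:

    import paddle

    # Mirrors the mask construction at the top of TransformerDecoder.forward:
    # a per-token "non-pad" mask ANDed with a lower-triangular causal mask.
    ys_in_lens = paddle.to_tensor([5, 3])      # true target lengths for a batch of 2
    L = 5                                      # padded target length (maxlen_out)

    # non-pad mask: (B, 1, L), True where the position holds a real token
    positions = paddle.arange(L).unsqueeze(0)                           # (1, L)
    non_pad_mask = (positions < ys_in_lens.unsqueeze(1)).unsqueeze(1)   # (B, 1, L)

    # causal mask: (1, L, L), True where step i may attend to step j <= i
    causal_mask = paddle.tril(paddle.ones([L, L], dtype='int32')).astype('bool').unsqueeze(0)

    # combined tgt_mask: (B, L, L)
    tgt_mask = paddle.logical_and(non_pad_mask, causal_mask)
    print(tgt_mask.astype('int32'))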
deepspeech/modules/decoder_layer.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
from typing import Optional, Tuple
import logging

import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I

logger = logging.getLogger(__name__)

__all__ = ["DecoderLayer"]


class DecoderLayer(nn.Module):
    """Single decoder layer module.
    Args:
        size (int): Input dimension.
        self_attn (nn.Module): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        src_attn (nn.Module): Source-attention (encoder-decoder attention) module instance.
            `MultiHeadedAttention` instance can be used as the argument.
        feed_forward (nn.Module): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
            True: use layer_norm before each sub-block.
            False: use layer_norm after each sub-block.
        concat_after (bool): Whether to concat attention layer's input
            and output.
            True: x -> x + linear(concat(x, att(x)))
            False: x -> x + att(x)
    """

    def __init__(
            self,
            size: int,
            self_attn: nn.Module,
            src_attn: nn.Module,
            feed_forward: nn.Module,
            dropout_rate: float,
            normalize_before: bool=True,
            concat_after: bool=False, ):
        """Construct a DecoderLayer object."""
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
        self.norm3 = nn.LayerNorm(size, epsilon=1e-12)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        self.concat_linear1 = nn.Linear(size + size, size)
        self.concat_linear2 = nn.Linear(size + size, size)

    def forward(
            self,
            tgt: paddle.Tensor,
            tgt_mask: paddle.Tensor,
            memory: paddle.Tensor,
            memory_mask: paddle.Tensor,
            cache: Optional[paddle.Tensor]=None
    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
        """Compute decoded features.
        Args:
            tgt (paddle.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (paddle.Tensor): Mask for input tensor
                (#batch, maxlen_out).
            memory (paddle.Tensor): Encoded memory
                (#batch, maxlen_in, size).
            memory_mask (paddle.Tensor): Encoded memory mask
                (#batch, maxlen_in).
            cache (paddle.Tensor): cached tensors.
                (#batch, maxlen_out - 1, size).
        Returns:
            paddle.Tensor: Output tensor (#batch, maxlen_out, size).
            paddle.Tensor: Mask for output tensor (#batch, maxlen_out).
            paddle.Tensor: Encoded memory (#batch, maxlen_in, size).
            paddle.Tensor: Encoded memory mask (#batch, maxlen_in).
        """
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)

        if cache is None:
            tgt_q = tgt
            tgt_q_mask = tgt_mask
        else:
            # compute only the last frame query keeping dim: max_time_out -> 1
            assert cache.shape == (
                tgt.shape[0],
                tgt.shape[1] - 1,
                self.size, ), f"{cache.shape} == {(tgt.shape[0], tgt.shape[1] - 1, self.size)}"
            tgt_q = tgt[:, -1:, :]
            residual = residual[:, -1:, :]
            tgt_q_mask = tgt_mask[:, -1:, :]

        if self.concat_after:
            tgt_concat = paddle.cat(
                (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1)
            x = residual + self.concat_linear1(tgt_concat)
        else:
            x = residual + self.dropout(
                self.self_attn(tgt_q, tgt, tgt, tgt_q_mask))
        if not self.normalize_before:
            x = self.norm1(x)

        residual = x
        if self.normalize_before:
            x = self.norm2(x)
        if self.concat_after:
            x_concat = paddle.cat(
                (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1)
            x = residual + self.concat_linear2(x_concat)
        else:
            x = residual + self.dropout(
                self.src_attn(x, memory, memory, memory_mask))
        if not self.normalize_before:
            x = self.norm2(x)

        residual = x
        if self.normalize_before:
            x = self.norm3(x)
        x = residual + self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm3(x)

        if cache is not None:
            x = paddle.cat([cache, x], dim=1)

        return x, tgt_mask, memory, memory_mask
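Each sub-block above (self-attention, source attention, feed-forward) follows the same pre-norm/post-norm residual wiring controlled by normalize_before. A compact sketch of that wiring, with sublayer and norm standing in for any of the three pairs (not the layer's literal code):

    def residual_block(x, sublayer, norm, dropout, normalize_before=True):
        """Pre-norm (normalize_before=True) or post-norm residual wrapper,
        matching the pattern repeated three times in DecoderLayer.forward."""
        residual = x
        if normalize_before:          # pre-norm: LayerNorm on the sub-block input
            x = norm(x)
        x = residual + dropout(sublayer(x))
        if not normalize_before:      # post-norm: LayerNorm on the summed output
            x = norm(x)
        return x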
deepspeech/modules/mask.py
...
@@ -50,8 +50,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
     return mask


-def subsequent_mask(size: int, ) -> paddle.Tensor:
+def subsequent_mask(size: int) -> paddle.Tensor:
     """Create mask for subsequent steps (size, size).
     This mask is used only in decoder which works in an auto-regressive mode.
     This means the current step could only do attention with its left steps.
...
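A quick check of the helper whose signature is reformatted above (a sketch; it assumes the deepspeech package is on the import path and that subsequent_mask returns a 0/1 lower-triangular tensor as its docstring describes):

    from deepspeech.modules.mask import subsequent_mask

    m = subsequent_mask(3)        # causal mask covering 3 decoding steps
    print(m)
    # Expected lower-triangular pattern (step i attends to steps <= i):
    # [[1, 0, 0],
    #  [1, 1, 0],
    #  [1, 1, 1]]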