Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
4c838379
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
4c838379
编写于
7月 02, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
size to shape; repeat to tile
上级
94918305
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
47 addition
and
38 deletion
+47
-38
deepspeech/exps/u2/model.py
deepspeech/exps/u2/model.py
+14
-4
deepspeech/models/u2.py
deepspeech/models/u2.py
+18
-19
deepspeech/modules/attention.py
deepspeech/modules/attention.py
+7
-7
deepspeech/modules/decoder.py
deepspeech/modules/decoder.py
+1
-1
deepspeech/modules/embedding.py
deepspeech/modules/embedding.py
+2
-2
deepspeech/utils/tensor_utils.py
deepspeech/utils/tensor_utils.py
+5
-5
未找到文件。
deepspeech/exps/u2/model.py
浏览文件 @
4c838379
...
...
@@ -532,7 +532,7 @@ class U2Tester(U2Trainer):
# 1. Encoder
encoder_out
,
encoder_mask
=
self
.
model
.
_forward_encoder
(
feat
,
feats_length
)
# (B, maxlen, encoder_dim)
maxlen
=
encoder_out
.
s
ize
(
1
)
maxlen
=
encoder_out
.
s
hape
[
1
]
ctc_probs
=
self
.
model
.
ctc
.
log_softmax
(
encoder_out
)
# (1, maxlen, vocab_size)
...
...
@@ -598,10 +598,20 @@ class U2Tester(U2Trainer):
def
export
(
self
):
infer_model
,
input_spec
=
self
.
load_inferspec
()
assert
isinstance
(
input_spec
,
list
),
type
(
input_spec
)
#
assert isinstance(input_spec, list), type(input_spec)
infer_model
.
eval
()
static_model
=
paddle
.
jit
.
to_static
(
infer_model
,
input_spec
=
input_spec
)
logger
.
info
(
f
"Export code:
{
static_model
.
forward
.
code
}
"
)
#static_model = paddle.jit.to_static(infer_model., input_spec=input_spec)
static_model
=
paddle
.
jit
.
to_static
(
infer_model
.
forward_attention_decoder
,
input_spec
=
[
paddle
.
static
.
InputSpec
(
shape
=
[
1
,
None
],
dtype
=
'int32'
),
paddle
.
static
.
InputSpec
(
shape
=
[
1
],
dtype
=
'int32'
),
paddle
.
static
.
InputSpec
(
shape
=
[
1
,
None
,
256
],
dtype
=
'int32'
),
]
)
logger
.
info
(
f
"Export code:
{
static_model
}
"
)
paddle
.
jit
.
save
(
static_model
,
self
.
args
.
export_path
)
def
run_export
(
self
):
...
...
deepspeech/models/u2.py
浏览文件 @
4c838379
...
...
@@ -299,8 +299,8 @@ class U2BaseModel(nn.Module):
speech
,
speech_lengths
,
decoding_chunk_size
,
num_decoding_left_chunks
,
simulate_streaming
)
# (B, maxlen, encoder_dim)
maxlen
=
encoder_out
.
s
ize
(
1
)
encoder_dim
=
encoder_out
.
s
ize
(
2
)
maxlen
=
encoder_out
.
s
hape
[
1
]
encoder_dim
=
encoder_out
.
s
hape
[
2
]
running_size
=
batch_size
*
beam_size
encoder_out
=
encoder_out
.
unsqueeze
(
1
).
repeat
(
1
,
beam_size
,
1
,
1
).
view
(
running_size
,
maxlen
,
encoder_dim
)
# (B*N, maxlen, encoder_dim)
...
...
@@ -405,7 +405,7 @@ class U2BaseModel(nn.Module):
encoder_out
,
encoder_mask
=
self
.
_forward_encoder
(
speech
,
speech_lengths
,
decoding_chunk_size
,
num_decoding_left_chunks
,
simulate_streaming
)
maxlen
=
encoder_out
.
s
ize
(
1
)
maxlen
=
encoder_out
.
s
hape
[
1
]
# (TODO Hui Zhang): bool no support reduce_sum
# encoder_out_lens = encoder_mask.squeeze(1).sum(1)
encoder_out_lens
=
encoder_mask
.
squeeze
(
1
).
astype
(
paddle
.
int
).
sum
(
1
)
...
...
@@ -455,7 +455,7 @@ class U2BaseModel(nn.Module):
speech
,
speech_lengths
,
decoding_chunk_size
,
num_decoding_left_chunks
,
simulate_streaming
)
# (B, maxlen, encoder_dim)
maxlen
=
encoder_out
.
s
ize
(
1
)
maxlen
=
encoder_out
.
s
hape
[
1
]
ctc_probs
=
self
.
ctc
.
log_softmax
(
encoder_out
)
# (1, maxlen, vocab_size)
ctc_probs
=
ctc_probs
.
squeeze
(
0
)
# cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
...
...
@@ -578,7 +578,7 @@ class U2BaseModel(nn.Module):
hyps_lens
=
hyps_lens
+
1
# Add <sos> at begining
encoder_out
=
encoder_out
.
repeat
(
beam_size
,
1
,
1
)
encoder_mask
=
paddle
.
ones
(
(
beam_size
,
1
,
encoder_out
.
s
ize
(
1
)
),
dtype
=
paddle
.
bool
)
(
beam_size
,
1
,
encoder_out
.
s
hape
[
1
]
),
dtype
=
paddle
.
bool
)
decoder_out
,
_
=
self
.
decoder
(
encoder_out
,
encoder_mask
,
hyps_pad
,
hyps_lens
)
# (beam_size, max_hyps_len, vocab_size)
...
...
@@ -624,7 +624,7 @@ class U2BaseModel(nn.Module):
"""
return
self
.
eos
@
jit
.
export
#
@jit.export
def
forward_encoder_chunk
(
self
,
xs
:
paddle
.
Tensor
,
...
...
@@ -654,9 +654,7 @@ class U2BaseModel(nn.Module):
xs
,
offset
,
required_cache_size
,
subsampling_cache
,
elayers_output_cache
,
conformer_cnn_cache
)
# @jit.export([
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
# @jit.export
def
ctc_activation
(
self
,
xs
:
paddle
.
Tensor
)
->
paddle
.
Tensor
:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
...
...
@@ -667,7 +665,7 @@ class U2BaseModel(nn.Module):
"""
return
self
.
ctc
.
log_softmax
(
xs
)
@
jit
.
export
#
@jit.export
def
forward_attention_decoder
(
self
,
hyps
:
paddle
.
Tensor
,
...
...
@@ -683,13 +681,14 @@ class U2BaseModel(nn.Module):
Returns:
paddle.Tensor: decoder output, (B, L)
"""
assert
encoder_out
.
size
(
0
)
==
1
num_hyps
=
hyps
.
size
(
0
)
assert
hyps_lens
.
size
(
0
)
==
num_hyps
encoder_out
=
encoder_out
.
repeat
(
num_hyps
,
1
,
1
)
assert
encoder_out
.
shape
[
0
]
==
1
num_hyps
=
hyps
.
shape
[
0
]
assert
hyps_lens
.
shape
[
0
]
==
num_hyps
# encoder_out = encoder_out.repeat(num_hyps, 1, 1)
encoder_out
=
encoder_out
.
tile
([
num_hyps
,
1
,
1
])
# (B, 1, T)
encoder_mask
=
paddle
.
ones
(
[
num_hyps
,
1
,
encoder_out
.
s
ize
(
1
)
],
dtype
=
paddle
.
bool
)
[
num_hyps
,
1
,
encoder_out
.
s
hape
[
1
]
],
dtype
=
paddle
.
bool
)
# (num_hyps, max_hyps_len, vocab_size)
decoder_out
,
_
=
self
.
decoder
(
encoder_out
,
encoder_mask
,
hyps
,
hyps_lens
)
...
...
@@ -744,7 +743,7 @@ class U2BaseModel(nn.Module):
Returns:
List[List[int]]: transcripts.
"""
batch_size
=
feats
.
s
ize
(
0
)
batch_size
=
feats
.
s
hape
[
0
]
if
decoding_method
in
[
'ctc_prefix_beam_search'
,
'attention_rescoring'
]
and
batch_size
>
1
:
logger
.
fatal
(
...
...
@@ -772,7 +771,7 @@ class U2BaseModel(nn.Module):
# result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode
elif
decoding_method
==
'ctc_prefix_beam_search'
:
assert
feats
.
s
ize
(
0
)
==
1
assert
feats
.
s
hape
[
0
]
==
1
hyp
=
self
.
ctc_prefix_beam_search
(
feats
,
feats_lengths
,
...
...
@@ -782,7 +781,7 @@ class U2BaseModel(nn.Module):
simulate_streaming
=
simulate_streaming
)
hyps
=
[
hyp
]
elif
decoding_method
==
'attention_rescoring'
:
assert
feats
.
s
ize
(
0
)
==
1
assert
feats
.
s
hape
[
0
]
==
1
hyp
=
self
.
attention_rescoring
(
feats
,
feats_lengths
,
...
...
@@ -922,7 +921,7 @@ class U2InferModel(U2Model):
Returns:
List[List[int]]: best path result
"""
return
self
.
ctc_greedy_search
(
return
self
.
attention_rescoring
(
feats
,
feats_lengths
,
decoding_chunk_size
=
decoding_chunk_size
,
...
...
deepspeech/modules/attention.py
浏览文件 @
4c838379
...
...
@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value tensor, size
(#batch, n_head, time2, d_k).
"""
n_batch
=
query
.
s
ize
(
0
)
n_batch
=
query
.
s
hape
[
0
]
q
=
self
.
linear_q
(
query
).
view
(
n_batch
,
-
1
,
self
.
h
,
self
.
d_k
)
k
=
self
.
linear_k
(
key
).
view
(
n_batch
,
-
1
,
self
.
h
,
self
.
d_k
)
v
=
self
.
linear_v
(
value
).
view
(
n_batch
,
-
1
,
self
.
h
,
self
.
d_k
)
...
...
@@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
paddle.Tensor: Transformed value weighted
by the attention score, (#batch, time1, d_model).
"""
n_batch
=
value
.
s
ize
(
0
)
n_batch
=
value
.
s
hape
[
0
]
if
mask
is
not
None
:
mask
=
mask
.
unsqueeze
(
1
).
eq
(
0
)
# (batch, 1, *, time2)
scores
=
scores
.
masked_fill
(
mask
,
-
float
(
'inf'
))
...
...
@@ -172,15 +172,15 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
paddle.Tensor: Output tensor. (batch, head, time1, time1)
"""
zero_pad
=
paddle
.
zeros
(
(
x
.
s
ize
(
0
),
x
.
size
(
1
),
x
.
size
(
2
)
,
1
),
dtype
=
x
.
dtype
)
(
x
.
s
hape
[
0
],
x
.
shape
[
1
],
x
.
shape
[
2
]
,
1
),
dtype
=
x
.
dtype
)
x_padded
=
paddle
.
cat
([
zero_pad
,
x
],
dim
=-
1
)
x_padded
=
x_padded
.
view
(
x
.
s
ize
(
0
),
x
.
size
(
1
),
x
.
size
(
3
)
+
1
,
x
.
size
(
2
)
)
x_padded
=
x_padded
.
view
(
x
.
s
hape
[
0
],
x
.
shape
[
1
],
x
.
shape
[
3
]
+
1
,
x
.
shape
[
2
]
)
x
=
x_padded
[:,
:,
1
:].
view_as
(
x
)
# [B, H, T1, T1]
if
zero_triu
:
ones
=
paddle
.
ones
((
x
.
s
ize
(
2
),
x
.
size
(
3
)
))
x
=
x
*
paddle
.
tril
(
ones
,
x
.
s
ize
(
3
)
-
x
.
size
(
2
)
)[
None
,
None
,
:,
:]
ones
=
paddle
.
ones
((
x
.
s
hape
[
2
],
x
.
shape
[
3
]
))
x
=
x
*
paddle
.
tril
(
ones
,
x
.
s
hape
[
3
]
-
x
.
shape
[
2
]
)[
None
,
None
,
:,
:]
return
x
...
...
@@ -205,7 +205,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
q
,
k
,
v
=
self
.
forward_qkv
(
query
,
key
,
value
)
q
=
q
.
transpose
([
0
,
2
,
1
,
3
])
# (batch, time1, head, d_k)
n_batch_pos
=
pos_emb
.
s
ize
(
0
)
n_batch_pos
=
pos_emb
.
s
hape
[
0
]
p
=
self
.
linear_pos
(
pos_emb
).
view
(
n_batch_pos
,
-
1
,
self
.
h
,
self
.
d_k
)
p
=
p
.
transpose
([
0
,
2
,
1
,
3
])
# (batch, head, time1, d_k)
...
...
deepspeech/modules/decoder.py
浏览文件 @
4c838379
...
...
@@ -122,7 +122,7 @@ class TransformerDecoder(nn.Module):
# tgt_mask: (B, 1, L)
tgt_mask
=
(
make_non_pad_mask
(
ys_in_lens
).
unsqueeze
(
1
))
# m: (1, L, L)
m
=
subsequent_mask
(
tgt_mask
.
s
ize
(
-
1
)
).
unsqueeze
(
0
)
m
=
subsequent_mask
(
tgt_mask
.
s
hape
[
-
1
]
).
unsqueeze
(
0
)
# tgt_mask: (B, L, L)
# TODO(Hui Zhang): not support & for tensor
# tgt_mask = tgt_mask & m
...
...
deepspeech/modules/embedding.py
浏览文件 @
4c838379
...
...
@@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
"""
T
=
x
.
shape
[
1
]
assert
offset
+
x
.
s
ize
(
1
)
<
self
.
max_len
assert
offset
+
x
.
s
hape
[
1
]
<
self
.
max_len
#TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
pos_emb
=
self
.
pe
[:,
offset
:
offset
+
T
]
x
=
x
*
self
.
xscale
+
pos_emb
...
...
@@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
paddle.Tensor: Encoded tensor (batch, time, `*`).
paddle.Tensor: Positional embedding tensor (1, time, `*`).
"""
assert
offset
+
x
.
s
ize
(
1
)
<
self
.
max_len
assert
offset
+
x
.
s
hape
[
1
]
<
self
.
max_len
x
=
x
*
self
.
xscale
#TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor
pos_emb
=
self
.
pe
[:,
offset
:
offset
+
x
.
shape
[
1
]]
...
...
deepspeech/utils/tensor_utils.py
浏览文件 @
4c838379
...
...
@@ -65,11 +65,11 @@ def pad_sequence(sequences: List[paddle.Tensor],
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
max_size
=
sequences
[
0
].
s
ize
()
max_size
=
sequences
[
0
].
s
hape
# (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:]
trailing_dims
=
max_size
[
1
:]
if
max_size
.
ndim
>=
2
else
()
max_len
=
max
([
s
.
s
ize
(
0
)
for
s
in
sequences
])
max_len
=
max
([
s
.
s
hape
[
0
]
for
s
in
sequences
])
if
batch_first
:
out_dims
=
(
len
(
sequences
),
max_len
)
+
trailing_dims
else
:
...
...
@@ -77,7 +77,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
out_tensor
=
sequences
[
0
].
new_full
(
out_dims
,
padding_value
)
for
i
,
tensor
in
enumerate
(
sequences
):
length
=
tensor
.
s
ize
(
0
)
length
=
tensor
.
s
hape
[
0
]
# use index notation to prevent duplicate references to the tensor
if
batch_first
:
out_tensor
[
i
,
:
length
,
...]
=
tensor
...
...
@@ -125,7 +125,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
#ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
#ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
#return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
B
=
ys_pad
.
s
ize
(
0
)
B
=
ys_pad
.
s
hape
[
0
]
_sos
=
paddle
.
ones
([
B
,
1
],
dtype
=
ys_pad
.
dtype
)
*
sos
_eos
=
paddle
.
ones
([
B
,
1
],
dtype
=
ys_pad
.
dtype
)
*
eos
ys_in
=
paddle
.
cat
([
_sos
,
ys_pad
],
dim
=
1
)
...
...
@@ -152,7 +152,7 @@ def th_accuracy(pad_outputs: paddle.Tensor,
float: Accuracy value (0.0 - 1.0).
"""
pad_pred
=
pad_outputs
.
view
(
pad_targets
.
s
ize
(
0
)
,
pad_targets
.
size
(
1
),
pad_outputs
.
size
(
1
)).
argmax
(
2
)
pad_targets
.
s
hape
[
0
]
,
pad_targets
.
size
(
1
),
pad_outputs
.
size
(
1
)).
argmax
(
2
)
mask
=
pad_targets
!=
ignore_label
#TODO(Hui Zhang): sum not support bool type
# numerator = paddle.sum(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录