PaddlePaddle / DeepSpeech · commit 6de81d74

eliminate cast dtype for bool op

Authored Sep 26, 2022 by Hui Zhang
Parent: 8e7a315e
Showing 6 changed files with 18 additions and 53 deletions (+18 -53):
paddlespeech/s2t/models/u2/u2.py          +3  -9
paddlespeech/s2t/models/u2_st/u2_st.py    +1  -4
paddlespeech/s2t/modules/decoder.py       +2  -6
paddlespeech/s2t/modules/encoder.py       +7  -19
paddlespeech/s2t/modules/mask.py          +2  -7
paddlespeech/s2t/utils/tensor_utils.py    +3  -8
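Every hunk below follows the same pattern: workaround casts that were needed while Paddle lacked full bool-tensor support (`sum`, `tril`, `&`, `~` on bool) are dropped in favor of the direct boolean op. A minimal standalone sketch of the before/after (hypothetical tensors, assuming a Paddle build where bool reductions are supported, as this commit implies):

```python
import paddle

# A [B, 1, T] padding mask like the one returned by the encoder.
mask = paddle.to_tensor([[[True, True, False]],
                         [[True, False, False]]])

# Before: sum() rejected bool input, so the mask was cast first.
lens_old = mask.squeeze(1).cast(paddle.int64).sum(1)  # [B, 1, T] -> [B]

# After: the bool tensor is reduced directly.
lens_new = mask.squeeze(1).sum(1)

print(lens_old.numpy(), lens_new.numpy())  # [2 1] [2 1]
```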
paddlespeech/s2t/models/u2/u2.py

```diff
@@ -124,10 +124,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         encoder_time = time.time() - start
         #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
         # 2a. Attention-decoder branch
         loss_att = None
@@ -291,8 +288,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         # 2. Decoder forward step by step
         for i in range(1, maxlen + 1):
             # Stop if all batch and all beam produce eos
-            # TODO(Hui Zhang): if end_flag.sum() == running_size:
-            if end_flag.cast(paddle.int64).sum() == running_size:
+            if end_flag.sum() == running_size:
                 break
             # 2.1 Forward decoder step
@@ -378,9 +374,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
         maxlen = encoder_out.shape[1]
-        # (TODO Hui Zhang): bool no support reduce_sum
-        # encoder_out_lens = encoder_mask.squeeze(1).sum(1)
-        encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (B, maxlen, vocab_size)
         topk_prob, topk_index = ctc_probs.topk(1, axis=2)  # (B, maxlen, 1)
```
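The second hunk simplifies the beam-search early-stop test in the same way. A toy illustration (hypothetical `end_flag` and `running_size` values, not taken from the repository):

```python
import paddle

running_size = 4  # batch_size * beam_size
end_flag = paddle.to_tensor([True, True, True, True])

# Before: end_flag.cast(paddle.int64).sum() == running_size
# After: sum() works on the bool flags directly.
if end_flag.sum() == running_size:
    print("every hypothesis has emitted <eos>; stop decoding")
```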
paddlespeech/s2t/models/u2_st/u2_st.py

```diff
@@ -111,10 +111,7 @@ class U2STBaseModel(nn.Layer):
         encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
         encoder_time = time.time() - start
         #logger.debug(f"encoder time: {encoder_time}")
-        #TODO(Hui Zhang): sum not support bool type
-        #encoder_out_lens = encoder_mask.squeeze(1).sum(1) #[B, 1, T] -> [B]
-        encoder_out_lens = encoder_mask.squeeze(1).cast(paddle.int64).sum(1)  #[B, 1, T] -> [B]
+        encoder_out_lens = encoder_mask.squeeze(1).sum(1)  #[B, 1, T] -> [B]
         # 2a. ST-decoder branch
         start = time.time()
```
paddlespeech/s2t/modules/decoder.py

```diff
@@ -140,9 +140,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         # m: (1, L, L)
         m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
         # tgt_mask: (B, L, L)
-        # TODO(Hui Zhang): not support & for tensor
-        # tgt_mask = tgt_mask & m
-        tgt_mask = tgt_mask.logical_and(m)
+        tgt_mask = tgt_mask & m
         x, _ = self.embed(tgt)
         for layer in self.decoders:
@@ -153,9 +151,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         if self.use_output_layer:
             x = self.output_layer(x)
-        # TODO(Hui Zhang): reduce_sum not support bool type
-        # olens = tgt_mask.sum(1)
-        olens = tgt_mask.astype(paddle.int).sum(1)
+        olens = tgt_mask.sum(1)
         return x, paddle.to_tensor(0.0), olens

     def forward_one_step(
```
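Both decoder hunks lean on the same newly supported operations: `&` between bool tensors replaces `logical_and`, and `sum` over a bool mask replaces the `astype(paddle.int)` detour. A quick sketch (hypothetical masks):

```python
import paddle

tgt_mask = paddle.to_tensor([[True, True, False]])
m = paddle.to_tensor([[True, False, False]])

combined = tgt_mask & m  # was: tgt_mask.logical_and(m)
olens = combined.sum(1)  # was: combined.astype(paddle.int).sum(1)
print(combined.numpy(), olens.numpy())  # [[ True False False]] [1]
```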
paddlespeech/s2t/modules/encoder.py

```diff
@@ -164,12 +164,8 @@ class BaseEncoder(nn.Layer):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
-        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
-        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
-        masks = masks.astype(paddle.bool)
-        #TODO(Hui Zhang): mask_pad = ~masks
-        mask_pad = masks.logical_not()
+        xs, pos_emb, masks = self.embed(xs, masks, offset=0)
+        mask_pad = ~masks
         chunk_masks = add_optional_chunk_mask(
             xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
             decoding_chunk_size, self.static_chunk_size,
@@ -215,11 +211,8 @@ class BaseEncoder(nn.Layer):
             same shape as the original cnn_cache
         """
         assert xs.shape[0] == 1  # batch size must be one
-        # tmp_masks is just for interface compatibility
-        # TODO(Hui Zhang): stride_slice not support bool tensor
-        # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool)
-        tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
-        tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]
+        # tmp_masks is just for interface compatibility, [B=1, C=1, T]
+        tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool)
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
@@ -228,9 +221,8 @@ class BaseEncoder(nn.Layer):
         xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
         # after embed, xs=(B=1, chunk_size, hidden-dim)
-        elayers = paddle.shape(att_cache)[0]
-        cache_t1 = paddle.shape(att_cache)[2]
-        chunk_size = paddle.shape(xs)[1]
+        elayers, _, cache_t1, _ = att_cache.shape
+        chunk_size = xs.shape[1]
         attention_key_size = cache_t1 + chunk_size
         # only used when using `RelPositionMultiHeadedAttention`
@@ -402,11 +394,7 @@ class TransformerEncoder(BaseEncoder):
         if self.global_cmvn is not None:
             xs = self.global_cmvn(xs)
-        #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-        xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
-        #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
-        masks = masks.astype(paddle.bool)
+        xs, pos_emb, masks = self.embed(xs, masks, offset=0)
         if cache is None:
             cache = [None for _ in range(len(self.encoders))]
         new_cache = []
```
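The encoder hunks stop laundering masks through float/int dtypes around `self.embed` and use `~` for mask inversion. A minimal sketch of the inversion (hypothetical mask):

```python
import paddle

masks = paddle.to_tensor([[[True, True, False]]])  # [B=1, 1, T]

# Before: mask_pad = masks.logical_not(); after: the operator form.
mask_pad = ~masks
print(mask_pad.numpy())  # [[[False False  True]]]
```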
paddlespeech/s2t/modules/mask.py

```diff
@@ -109,13 +109,8 @@ def subsequent_mask(size: int) -> paddle.Tensor:
          [1, 1, 1]]
     """
     ret = paddle.ones([size, size], dtype=paddle.bool)
-    #TODO(Hui Zhang): tril not support bool
-    #return paddle.tril(ret)
-    ret = ret.astype(paddle.float)
-    ret = paddle.tril(ret)
-    ret = ret.astype(paddle.bool)
-    return ret
+    return paddle.tril(ret)


 def subsequent_chunk_mask(
         size: int,
```
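With `paddle.tril` accepting bool input, the float round-trip (bool -> float -> tril -> bool) disappears. A standalone version of the simplified function, runnable as-is under the same assumption:

```python
import paddle

def subsequent_mask(size: int) -> paddle.Tensor:
    """Causal (lower-triangular) mask, kept in bool throughout."""
    ret = paddle.ones([size, size], dtype=paddle.bool)
    return paddle.tril(ret)

print(subsequent_mask(3).numpy())
# [[ True False False]
#  [ True  True False]
#  [ True  True  True]]
```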
paddlespeech/s2t/utils/tensor_utils.py

```diff
@@ -184,13 +184,8 @@ def th_accuracy(pad_outputs: paddle.Tensor,
     pad_pred = pad_outputs.view(pad_targets.shape[0], pad_targets.shape[1],
                                 pad_outputs.shape[1]).argmax(2)
     mask = pad_targets != ignore_label
-    #TODO(Hui Zhang): sum not support bool type
-    # numerator = paddle.sum(
-    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
-    numerator = (
-        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
-    numerator = paddle.sum(numerator.type_as(pad_targets))
-    #TODO(Hui Zhang): sum not support bool type
-    # denominator = paddle.sum(mask)
-    denominator = paddle.sum(mask.type_as(pad_targets))
+    numerator = paddle.sum(
+        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
+    denominator = paddle.sum(mask)
     return float(numerator) / float(denominator)
```
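Here both reductions now run on bool tensors, so the two `type_as(pad_targets)` casts go away. A toy check of the new arithmetic (hypothetical predictions and targets):

```python
import paddle

pad_targets = paddle.to_tensor([[1, 2, -1], [3, -1, -1]])  # -1 = padding
pad_pred = paddle.to_tensor([[1, 0, 5], [3, 7, 7]])
ignore_label = -1

mask = pad_targets != ignore_label
numerator = paddle.sum(
    pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
denominator = paddle.sum(mask)
print(float(numerator) / float(denominator))  # 2 of 3 real labels correct -> 0.666...
```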