PaddlePaddle / DeepSpeech

Commit 9c7f0762
Authored January 17, 2022 by 小湉湉
Parent: 89e988a6

    update racotron2 and transformer tts, test=tts

Showing 5 changed files with 132 additions and 371 deletions (+132 / -371)
Changed files:
  paddlespeech/t2s/models/new_tacotron2/tacotron2.py                   +4    -1
  paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py           +6   -13
  paddlespeech/t2s/models/transformer_tts/transformer_tts.py           +7  -320
  paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py  +24   -10
  paddlespeech/t2s/modules/losses.py                                  +91   -27
paddlespeech/t2s/models/new_tacotron2/tacotron2.py

@@ -324,7 +324,10 @@ class Tacotron2(nn.Layer):
             ys = ys[:, :max_out]
             labels = labels[:, :max_out]
             labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
-        return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens
+        return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in

     def _forward(self,
...
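With this change Tacotron2.forward computes olens_in, the decoder-side target lengths after frame reduction, and returns it in place of ilens, so the updaters no longer redo the division themselves. A minimal sketch of the length bookkeeping with made-up values (names follow the diff; the numbers are only illustrative):

    import paddle

    reduction_factor = 2
    olens = paddle.to_tensor([7, 10, 5])        # raw target lengths (B,)
    olens = olens - olens % reduction_factor    # trim to a multiple of r -> [6, 10, 4]
    olens_in = olens // reduction_factor        # decoder-side lengths    -> [3, 5, 2]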
paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py

@@ -72,11 +72,10 @@ class Tacotron2Updater(StandardUpdater):
         # spk_id!=None in multiple spk fastspeech2
         spk_id = batch["spk_id"] if "spk_id" in batch else None
         spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
         # No explicit speaker identifier labels are used during voice cloning training.
         if spk_emb is not None:
             spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -101,11 +100,8 @@ class Tacotron2Updater(StandardUpdater):
         if self.use_guided_attn_loss:
-            # NOTE: length of output for auto-regressive
-            # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             loss = loss + attn_loss

         optimizer = self.optimizer
...
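The updater now uses the olens_in returned by the model and passes explicit keyword arguments to the guided attention loss. The ilens=batch["text_lengths"] + 1 presumably accounts for an <eos> token appended to each text sequence inside the model; that reading is an assumption from the surrounding code, not something the diff states. As an annotated sketch:

    # Values fed to the guided attention loss (names as in the diff above):
    #   att_ws : (B, T_out, T_in)  decoder-to-encoder attention weights
    #   ilens  : batch["text_lengths"] + 1   # +1 for the appended <eos> (assumption)
    #   olens  : olens_in                    # decoder steps after frame reduction
    attn_loss = self.attn_loss(
        att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)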
@@ -169,7 +165,7 @@ class Tacotron2Evaluator(StandardEvaluator):
         if spk_emb is not None:
             spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -194,11 +190,8 @@ class Tacotron2Evaluator(StandardEvaluator):
         if self.use_guided_attn_loss:
-            # NOTE: length of output for auto-regressive
-            # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             loss = loss + attn_loss
         report("eval/l1_loss", float(l1_loss))
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py

@@ -447,12 +447,15 @@ class TransformerTTS(nn.Layer):
         # modifiy mod part of groundtruth
         if self.reduction_factor > 1:
-            olens = paddle.to_tensor(
-                [olen - olen % self.reduction_factor for olen in olens.numpy()])
+            olens = olens - olens % self.reduction_factor
             max_olen = max(olens)
             ys = ys[:, :max_olen]
             labels = labels[:, :max_olen]
             labels[:, -1] = 1.0  # make sure at least one frame has 1
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens

         need_dict = {}
         need_dict['encoder'] = self.encoder
         need_dict['decoder'] = self.decoder
...
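The trimming of olens to a multiple of the reduction factor is now plain tensor arithmetic instead of a list comprehension over olens.numpy(), which avoids the round trip through NumPy. A small sketch with illustrative values:

    import paddle

    r = 3
    olens = paddle.to_tensor([10, 7, 9])

    old_style = paddle.to_tensor(
        [int(olen) - int(olen) % r for olen in olens.numpy()])
    new_style = olens - olens % r
    # both give [9, 6, 9]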
@@ -462,7 +465,7 @@ class TransformerTTS(nn.Layer):
             'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
         need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc

-        return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
+        return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict

     def _forward(self,
...
@@ -488,8 +491,7 @@ class TransformerTTS(nn.Layer):
         # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
         if self.reduction_factor > 1:
             ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
-            olens_in = olens.new([olen // self.reduction_factor for olen in olens])
+            olens_in = olens // self.reduction_factor
         else:
             ys_in, olens_in = ys, olens
...
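For reference, the slice above keeps every r-th target frame (the last frame of each group of r) as the autoregressive decoder input, and olens_in holds the matching per-utterance lengths. A toy illustration with assumed shapes:

    import paddle

    r = 2
    ys = paddle.arange(12, dtype='float32').reshape([1, 6, 2])   # (B=1, Lmax=6, odim=2)
    olens = paddle.to_tensor([6])

    ys_in = ys[:, r - 1::r]      # keeps frames 1, 3, 5 -> shape (1, 3, 2)
    olens_in = olens // r        # [3]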
@@ -769,318 +771,3 @@ class TransformerTTSInference(nn.Layer):
         normalized_mel = self.acoustic_model.inference(text)[0]
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
-
-
-class TransformerTTSLoss(nn.Layer):
-    """Loss function module for Tacotron2."""
-
-    def __init__(self,
-                 use_masking=True,
-                 use_weighted_masking=False,
-                 bce_pos_weight=5.0):
-        """Initialize Tactoron2 loss module.
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to apply weighted masking in loss calculation.
-        bce_pos_weight : float
-            Weight of positive sample of stop token.
-        """
-        super().__init__()
-        assert (use_masking != use_weighted_masking) or not use_masking
-        self.use_masking = use_masking
-        self.use_weighted_masking = use_weighted_masking
-
-        # define criterions
-        reduction = "none" if self.use_weighted_masking else "mean"
-        self.l1_criterion = nn.L1Loss(reduction=reduction)
-        self.mse_criterion = nn.MSELoss(reduction=reduction)
-        self.bce_criterion = nn.BCEWithLogitsLoss(
-            reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
-
-    def forward(self, after_outs, before_outs, logits, ys, labels, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        logits : Tensor
-            Batch of stop logits (B, Lmax).
-        ys : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        labels : LongTensor
-            Batch of the sequences of stop token labels (B, Lmax).
-        olens : LongTensor
-            Batch of the lengths of each target (B,).
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Mean square error loss value.
-        Tensor
-            Binary cross entropy loss value.
-        """
-        # make mask and apply it
-        if self.use_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            ys = ys.masked_select(masks.broadcast_to(ys.shape))
-            after_outs = after_outs.masked_select(
-                masks.broadcast_to(after_outs.shape))
-            before_outs = before_outs.masked_select(
-                masks.broadcast_to(before_outs.shape))
-            # Operator slice does not have kernel for data_type[bool]
-            tmp_masks = paddle.cast(masks, dtype='int64')
-            tmp_masks = tmp_masks[:, :, 0]
-            tmp_masks = paddle.cast(tmp_masks, dtype='bool')
-            labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
-            logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
-
-        # calculate loss
-        l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
-            before_outs, ys)
-        mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
-            before_outs, ys)
-        bce_loss = self.bce_criterion(logits, labels)
-
-        # make weighted mask and apply it
-        if self.use_weighted_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
-            out_weights = weights.div(ys.shape[0] * ys.shape[2])
-            logit_weights = weights.div(ys.shape[0])
-
-            # apply weight
-            l1_loss = l1_loss.multiply(out_weights)
-            l1_loss = l1_loss.masked_select(
-                masks.broadcast_to(l1_loss.shape)).sum()
-            mse_loss = mse_loss.multiply(out_weights)
-            mse_loss = mse_loss.masked_select(
-                masks.broadcast_to(mse_loss.shape)).sum()
-            bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
-            bce_loss = bce_loss.masked_select(
-                masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
-
-        return l1_loss, mse_loss, bce_loss
-
-
-class GuidedAttentionLoss(nn.Layer):
-    """Guided attention loss function module.
-    This module calculates the guided attention loss described
-    in `Efficiently Trainable Text-to-Speech System Based
-    on Deep Convolutional Networks with Guided Attention`_,
-    which forces the attention to be diagonal.
-    .. _`Efficiently Trainable Text-to-Speech System
-        Based on Deep Convolutional Networks with Guided Attention`:
-        https://arxiv.org/abs/1710.08969
-    """
-
-    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
-        """Initialize guided attention loss module.
-        Parameters
-        ----------
-        sigma : float, optional
-            Standard deviation to control how close attention to a diagonal.
-        alpha : float, optional
-            Scaling coefficient (lambda).
-        reset_always : bool, optional
-            Whether to always reset masks.
-        """
-        super(GuidedAttentionLoss, self).__init__()
-        self.sigma = sigma
-        self.alpha = alpha
-        self.reset_always = reset_always
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def _reset_masks(self):
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of attention weights (B, T_max_out, T_max_in).
-        ilens : LongTensor
-            Batch of input lenghts (B,).
-        olens : LongTensor
-            Batch of output lenghts (B,).
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens)
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-        return self.alpha * loss
-
-    def _make_guided_attention_masks(self, ilens, olens):
-        n_batches = len(ilens)
-        max_ilen = max(ilens)
-        max_olen = max(olens)
-        guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
-        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
-            ilen = int(ilen)
-            olen = int(olen)
-            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(
-                ilen, olen, self.sigma)
-        return guided_attn_masks
-
-    @staticmethod
-    def _make_guided_attention_mask(ilen, olen, sigma):
-        """Make guided attention mask.
-        Examples
-        ----------
-        >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
-        >>> guided_attn_mask.shape
-        [5, 5]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
-                [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
-                [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
-                [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
-                [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
-        >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
-        >>> guided_attn_mask.shape
-        [6, 3]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.2934, 0.7506],
-                [0.0831, 0.0831, 0.5422],
-                [0.2934, 0.0000, 0.2934],
-                [0.5422, 0.0831, 0.0831],
-                [0.7506, 0.2934, 0.0000],
-                [0.8858, 0.5422, 0.0831]])
-        """
-        grid_x, grid_y = paddle.meshgrid(
-            paddle.arange(olen), paddle.arange(ilen))
-        grid_x = grid_x.cast(dtype=paddle.float32)
-        grid_y = grid_y.cast(dtype=paddle.float32)
-        return 1.0 - paddle.exp(-(
-            (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
-
-    @staticmethod
-    def _make_masks(ilens, olens):
-        """Make masks indicating non-padded part.
-        Parameters
-        ----------
-        ilens (LongTensor or List): Batch of lengths (B,).
-        olens (LongTensor or List): Batch of lengths (B,).
-        Returns
-        ----------
-        Tensor
-            Mask tensor indicating non-padded part.
-        Examples
-        ----------
-        >>> ilens, olens = [5, 2], [8, 5]
-        >>> _make_mask(ilens, olens)
-        tensor([[[1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1]],
-                [[1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
-        """
-        # (B, T_in)
-        in_masks = make_non_pad_mask(ilens)
-        # (B, T_out)
-        out_masks = make_non_pad_mask(olens)
-        # (B, T_out, T_in)
-        return paddle.logical_and(
-            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
-
-
-class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
-    """Guided attention loss function module for multi head attention.
-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation to control
-        how close attention to a diagonal.
-    alpha : float, optional
-        Scaling coefficient (lambda).
-    reset_always : bool, optional
-        Whether to always reset masks.
-    """
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        ilens : Tensor
-            Batch of input lenghts (B,).
-        olens : Tensor
-            Batch of output lenghts (B,).
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = (
-                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-        return self.alpha * loss
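None of the deleted loss classes disappear from the project: the remaining hunks of this commit move them into paddlespeech/t2s/modules/losses.py, and the updater below aliases Tacotron2Loss to the old TransformerTTSLoss name. For reference, the guided attention weight at encoder step t_in and decoder step t_out is 1 - exp(-((t_in/ilen - t_out/olen)^2) / (2*sigma^2)); a quick check that reproduces the first off-diagonal value of the docstring example (sigma = 0.4, ilen = olen = 5):

    import math

    sigma, ilen, olen = 0.4, 5, 5
    t_in, t_out = 1, 0                 # first off-diagonal cell of the 5x5 example
    w = 1.0 - math.exp(-((t_in / ilen - t_out / olen) ** 2) / (2 * sigma ** 2))
    print(round(w, 4))                 # 0.1175, matching the docstring table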
paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py

@@ -17,8 +17,8 @@ from typing import Sequence
 import paddle
 from paddle import distributed as dist
-from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
-from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
+from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
+from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
 from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
 from paddlespeech.t2s.training.reporter import report
 from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
...
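After this change, code outside the model file imports the losses from paddlespeech.t2s.modules.losses; the updater keeps its old local name by aliasing Tacotron2Loss. A hedged usage sketch (the constructor arguments mirror the GuidedAttentionLoss defaults shown above and are illustrative only):

    from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
    from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss

    # sigma/alpha defaults taken from GuidedAttentionLoss.__init__ in the moved code.
    attn_criterion = GuidedMultiHeadAttentionLoss(sigma=0.4, alpha=1.0)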
@@ -71,7 +71,7 @@ class TransformerTTSUpdater(StandardUpdater):
         self.msg = "Rank: {}, ".format(dist.get_rank())
         losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -116,7 +116,10 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_in, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+        enc_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=batch["text_lengths"] + 1)
         loss = loss + enc_attn_loss
         report("train/enc_attn_loss", float(enc_attn_loss))
         losses_dict["enc_attn_loss"] = float(enc_attn_loss)
...
@@ -133,7 +136,8 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_out, T_out)
         att_ws = paddle.concat(att_ws, axis=1)
-        dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+        dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws, ilens=olens_in, olens=olens_in)
         report("train/dec_attn_loss", float(dec_attn_loss))
         losses_dict["dec_attn_loss"] = float(dec_attn_loss)
         loss = loss + dec_attn_loss
...
@@ -150,7 +154,10 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_out, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+        enc_dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=olens_in)
         report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
         losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
         loss = loss + enc_dec_attn_loss
...
@@ -215,7 +222,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
     def evaluate_core(self, batch):
         self.msg = "Evaluate: "
         losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -260,7 +267,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_in, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+        enc_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=batch["text_lengths"] + 1)
         loss = loss + enc_attn_loss
         report("train/enc_attn_loss", float(enc_attn_loss))
         losses_dict["enc_attn_loss"] = float(enc_attn_loss)
...
@@ -277,7 +287,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_out, T_out)
         att_ws = paddle.concat(att_ws, axis=1)
-        dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+        dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws, ilens=olens_in, olens=olens_in)
         report("eval/dec_attn_loss", float(dec_attn_loss))
         losses_dict["dec_attn_loss"] = float(dec_attn_loss)
         loss = loss + dec_attn_loss
...
@@ -295,7 +306,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_out, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+        enc_dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=olens_in)
         report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
         losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
         loss = loss + enc_dec_attn_loss
...
paddlespeech/t2s/modules/losses.py

@@ -26,26 +26,30 @@ from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 # Loss for new Tacotron2
 class GuidedAttentionLoss(nn.Layer):
     """Guided attention loss function module.
     This module calculates the guided attention loss described
     in `Efficiently Trainable Text-to-Speech System Based
     on Deep Convolutional Networks with Guided Attention`_,
     which forces the attention to be diagonal.
     .. _`Efficiently Trainable Text-to-Speech System
         Based on Deep Convolutional Networks with Guided Attention`:
         https://arxiv.org/abs/1710.08969
     """

     def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
         """Initialize guided attention loss module.
         Parameters
         ----------
         sigma : float, optional
-            Standard deviation to control
-            how close attention to a diagonal.
+            Standard deviation to control how close attention to a diagonal.
         alpha : float, optional
             Scaling coefficient (lambda).
         reset_always : bool, optional
             Whether to always reset masks.
         """
         super().__init__()
         self.sigma = sigma
...
@@ -60,18 +64,21 @@ class GuidedAttentionLoss(nn.Layer):
     def forward(self, att_ws, ilens, olens):
         """Calculate forward propagation.
         Parameters
         ----------
         att_ws : Tensor
             Batch of attention weights (B, T_max_out, T_max_in).
         ilens : Tensor(int64)
-            Batch of input lengths (B,).
+            Batch of input lenghts (B,).
         olens : Tensor(int64)
-            Batch of output lengths (B,).
+            Batch of output lenghts (B,).
         Returns
         ----------
         Tensor
             Guided attention loss value.
         """
         if self.guided_attn_masks is None:
             self.guided_attn_masks = self._make_guided_attention_masks(ilens,
...
@@ -79,7 +86,8 @@ class GuidedAttentionLoss(nn.Layer):
         if self.masks is None:
             self.masks = self._make_masks(ilens, olens)
         losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(losses.masked_select(self.masks))
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
         if self.reset_always:
             self._reset_masks()
         return self.alpha * loss
...
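Broadcasting the mask to losses.shape before masked_select makes the shape match explicit instead of relying on the stored mask already having the same shape as the weighted attention map. A self-contained sketch of the pattern with toy tensors (not the library's own helpers or shapes):

    import paddle

    losses = paddle.rand([2, 6, 4])                      # e.g. guided_attn_masks * att_ws
    olens = paddle.to_tensor([6, 4])                     # valid decoder steps per sample
    steps = paddle.arange(6).unsqueeze(0)                # (1, T_out)
    masks = (steps < olens.unsqueeze(1)).unsqueeze(-1)   # (B, T_out, 1), bool
    loss = paddle.mean(losses.masked_select(masks.broadcast_to(losses.shape)))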
@@ -89,6 +97,7 @@ class GuidedAttentionLoss(nn.Layer):
         max_ilen = max(ilens)
         max_olen = max(olens)
         guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
         for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
             guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(
...
@@ -98,11 +107,12 @@ class GuidedAttentionLoss(nn.Layer):
     @staticmethod
     def _make_guided_attention_mask(ilen, olen, sigma):
         """Make guided attention mask.
-        Parameters
+        Examples
         ----------
         >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
         >>> guided_attn_mask.shape
-        Size([5, 5])
+        [5, 5]
         >>> guided_attn_mask
         tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
                 [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
...
@@ -111,7 +121,7 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
         >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
         >>> guided_attn_mask.shape
-        Size([6, 3])
+        [6, 3]
         >>> guided_attn_mask
         tensor([[0.0000, 0.2934, 0.7506],
                 [0.0831, 0.0831, 0.5422],
...
@@ -119,55 +129,109 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.5422, 0.0831, 0.0831],
                 [0.7506, 0.2934, 0.0000],
                 [0.8858, 0.5422, 0.0831]])
         """
         grid_x, grid_y = paddle.meshgrid(
             paddle.arange(olen), paddle.arange(ilen))
-        grid_x = paddle.cast(grid_x, dtype='float32')
-        grid_y = paddle.cast(grid_y, dtype='float32')
+        grid_x = grid_x.cast(dtype=paddle.float32)
+        grid_y = grid_y.cast(dtype=paddle.float32)
         return 1.0 - paddle.exp(-(
             (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))

     @staticmethod
     def _make_masks(ilens, olens):
         """Make masks indicating non-padded part.
-        Examples
+        Parameters
         ----------
+        ilens : Tensor(int64) or List
+            Batch of lengths (B,).
+        olens : Tensor(int64) or List
+            Batch of lengths (B,).
+        Returns
+        ----------
+        Tensor
+            Mask tensor indicating non-padded part.
+        Examples
+        ----------
         >>> ilens, olens = [5, 2], [8, 5]
         >>> _make_mask(ilens, olens)
         tensor([[[1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1]],
                 [[1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0]]],)
+                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
         """
         # (B, T_in)
         in_masks = make_non_pad_mask(ilens)
         # (B, T_out)
         out_masks = make_non_pad_mask(olens)
         # (B, T_out, T_in)
-        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
+        return paddle.logical_and(
+            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))


+class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
+    """Guided attention loss function module for multi head attention.
+    Parameters
+    ----------
+    sigma : float, optional
+        Standard deviation to control
+        how close attention to a diagonal.
+    alpha : float, optional
+        Scaling coefficient (lambda).
+    reset_always : bool, optional
+        Whether to always reset masks.
+    """
+
+    def forward(self, att_ws, ilens, olens):
+        """Calculate forward propagation.
+        Parameters
+        ----------
+        att_ws : Tensor
+            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+        ilens : Tensor
+            Batch of input lenghts (B,).
+        olens : Tensor
+            Batch of output lenghts (B,).
+        Returns
+        ----------
+        Tensor
+            Guided attention loss value.
+        """
+        if self.guided_attn_masks is None:
+            self.guided_attn_masks = (
+                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
+        if self.masks is None:
+            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
+        losses = self.guided_attn_masks * att_ws
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
+        if self.reset_always:
+            self._reset_masks()
+        return self.alpha * loss
+
+
 class Tacotron2Loss(nn.Layer):
...
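In _make_masks the bitwise & is replaced by an explicit paddle.logical_and; the result is still the outer AND of the input-side and output-side non-pad masks. A self-contained sketch that reproduces the docstring example without the library's make_non_pad_mask helper (the arange comparison below stands in for it, as an assumption about its behaviour):

    import paddle

    ilens = paddle.to_tensor([5, 2])
    olens = paddle.to_tensor([8, 5])

    # Stand-in for make_non_pad_mask: True where the step index is below the length.
    in_masks = paddle.arange(int(ilens.max())).unsqueeze(0) < ilens.unsqueeze(1)   # (B, T_in)
    out_masks = paddle.arange(int(olens.max())).unsqueeze(0) < olens.unsqueeze(1)  # (B, T_out)

    # (B, T_out, T_in); equivalent to out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
    masks = paddle.logical_and(out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))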