PaddlePaddle / DeepSpeech

Commit 9c7f0762
Authored January 17, 2022 by 小湉湉
Parent: 89e988a6

    update racotron2 and transformer tts, test=tts

Showing 5 changed files with 132 additions and 371 deletions (+132 / -371)
Changed files:
  paddlespeech/t2s/models/new_tacotron2/tacotron2.py                   +4    -1
  paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py           +6   -13
  paddlespeech/t2s/models/transformer_tts/transformer_tts.py           +7  -320
  paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py  +24   -10
  paddlespeech/t2s/modules/losses.py                                  +91   -27
paddlespeech/t2s/models/new_tacotron2/tacotron2.py

@@ -324,7 +324,10 @@ class Tacotron2(nn.Layer):
             ys = ys[:, :max_out]
             labels = labels[:, :max_out]
             labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
-        return after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens
+        return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in

     def _forward(self,
...
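With this change Tacotron2.forward computes olens_in, the decoder-side target lengths after frame reduction, and returns it in place of ilens, so the updaters no longer redo the division themselves. A minimal sketch of the length bookkeeping with made-up values (names follow the diff; the numbers are only illustrative):

    import paddle

    reduction_factor = 2
    olens = paddle.to_tensor([7, 10, 5])        # raw target lengths (B,)
    olens = olens - olens % reduction_factor    # trim to a multiple of r -> [6, 10, 4]
    olens_in = olens // reduction_factor        # decoder-side lengths    -> [3, 5, 2]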
paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py

@@ -72,11 +72,10 @@ class Tacotron2Updater(StandardUpdater):
         # spk_id!=None in multiple spk fastspeech2
         spk_id = batch["spk_id"] if "spk_id" in batch else None
         spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
         # No explicit speaker identifier labels are used during voice cloning training.
         if spk_emb is not None:
             spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -101,11 +100,8 @@ class Tacotron2Updater(StandardUpdater):
         if self.use_guided_attn_loss:
-            # NOTE: length of output for auto-regressive
-            # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             loss = loss + attn_loss

         optimizer = self.optimizer
...
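The updater now uses the olens_in returned by the model and passes explicit keyword arguments to the guided attention loss. The ilens=batch["text_lengths"] + 1 presumably accounts for an <eos> token appended to each text sequence inside the model; that reading is an assumption from the surrounding code, not something the diff states. As an annotated sketch:

    # Values fed to the guided attention loss (names as in the diff above):
    #   att_ws : (B, T_out, T_in)  decoder-to-encoder attention weights
    #   ilens  : batch["text_lengths"] + 1   # +1 for the appended <eos> (assumption)
    #   olens  : olens_in                    # decoder steps after frame reduction
    attn_loss = self.attn_loss(
        att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)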
@@ -169,7 +165,7 @@ class Tacotron2Evaluator(StandardEvaluator):
         if spk_emb is not None:
             spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, ilens = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -194,11 +190,8 @@ class Tacotron2Evaluator(StandardEvaluator):
         if self.use_guided_attn_loss:
-            # NOTE: length of output for auto-regressive
-            # input will be changed when r > 1
-            if self.model.reduction_factor > 1:
-                olens_in = olens // self.model.reduction_factor
-            else:
-                olens_in = olens
-            attn_loss = self.attn_loss(att_ws, ilens, olens_in)
+            attn_loss = self.attn_loss(
+                att_ws=att_ws,
+                ilens=batch["text_lengths"] + 1,
+                olens=olens_in)
             loss = loss + attn_loss
         report("eval/l1_loss", float(l1_loss))
...
paddlespeech/t2s/models/transformer_tts/transformer_tts.py

@@ -447,12 +447,15 @@ class TransformerTTS(nn.Layer):
         # modifiy mod part of groundtruth
         if self.reduction_factor > 1:
-            olens = paddle.to_tensor(
-                [olen - olen % self.reduction_factor for olen in olens.numpy()])
+            olens = olens - olens % self.reduction_factor
             max_olen = max(olens)
             ys = ys[:, :max_olen]
             labels = labels[:, :max_olen]
             labels[:, -1] = 1.0  # make sure at least one frame has 1
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens

         need_dict = {}
         need_dict['encoder'] = self.encoder
         need_dict['decoder'] = self.decoder
...
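The trimming of olens to a multiple of the reduction factor is now plain tensor arithmetic instead of a list comprehension over olens.numpy(), which avoids the round trip through NumPy. A small sketch with illustrative values:

    import paddle

    r = 3
    olens = paddle.to_tensor([10, 7, 9])

    old_style = paddle.to_tensor(
        [int(olen) - int(olen) % r for olen in olens.numpy()])
    new_style = olens - olens % r
    # both give [9, 6, 9]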
@@ -462,7 +465,7 @@ class TransformerTTS(nn.Layer):
             'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
         need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc

-        return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict
+        return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict

     def _forward(self,
...
@@ -488,8 +491,7 @@ class TransformerTTS(nn.Layer):
         # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim)
         if self.reduction_factor > 1:
             ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor]
-            olens_in = olens.new([olen // self.reduction_factor for olen in olens])
+            olens_in = olens // self.reduction_factor
         else:
             ys_in, olens_in = ys, olens
...
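For reference, the slice above keeps every r-th target frame (the last frame of each group of r) as the autoregressive decoder input, and olens_in holds the matching per-utterance lengths. A toy illustration with assumed shapes:

    import paddle

    r = 2
    ys = paddle.arange(12, dtype='float32').reshape([1, 6, 2])   # (B=1, Lmax=6, odim=2)
    olens = paddle.to_tensor([6])

    ys_in = ys[:, r - 1::r]      # keeps frames 1, 3, 5 -> shape (1, 3, 2)
    olens_in = olens // r        # [3]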
@@ -769,318 +771,3 @@ class TransformerTTSInference(nn.Layer):
         normalized_mel = self.acoustic_model.inference(text)[0]
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
-
-
-class TransformerTTSLoss(nn.Layer):
-    """Loss function module for Tacotron2."""
-
-    def __init__(self,
-                 use_masking=True,
-                 use_weighted_masking=False,
-                 bce_pos_weight=5.0):
-        """Initialize Tactoron2 loss module.
-        Parameters
-        ----------
-        use_masking : bool
-            Whether to apply masking for padded part in loss calculation.
-        use_weighted_masking : bool
-            Whether to apply weighted masking in loss calculation.
-        bce_pos_weight : float
-            Weight of positive sample of stop token.
-        """
-        super().__init__()
-        assert (use_masking != use_weighted_masking) or not use_masking
-        self.use_masking = use_masking
-        self.use_weighted_masking = use_weighted_masking
-
-        # define criterions
-        reduction = "none" if self.use_weighted_masking else "mean"
-        self.l1_criterion = nn.L1Loss(reduction=reduction)
-        self.mse_criterion = nn.MSELoss(reduction=reduction)
-        self.bce_criterion = nn.BCEWithLogitsLoss(
-            reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))
-
-    def forward(self, after_outs, before_outs, logits, ys, labels, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        after_outs : Tensor
-            Batch of outputs after postnets (B, Lmax, odim).
-        before_outs : Tensor
-            Batch of outputs before postnets (B, Lmax, odim).
-        logits : Tensor
-            Batch of stop logits (B, Lmax).
-        ys : Tensor
-            Batch of padded target features (B, Lmax, odim).
-        labels : LongTensor
-            Batch of the sequences of stop token labels (B, Lmax).
-        olens : LongTensor
-            Batch of the lengths of each target (B,).
-        Returns
-        ----------
-        Tensor
-            L1 loss value.
-        Tensor
-            Mean square error loss value.
-        Tensor
-            Binary cross entropy loss value.
-        """
-        # make mask and apply it
-        if self.use_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            ys = ys.masked_select(masks.broadcast_to(ys.shape))
-            after_outs = after_outs.masked_select(
-                masks.broadcast_to(after_outs.shape))
-            before_outs = before_outs.masked_select(
-                masks.broadcast_to(before_outs.shape))
-            # Operator slice does not have kernel for data_type[bool]
-            tmp_masks = paddle.cast(masks, dtype='int64')
-            tmp_masks = tmp_masks[:, :, 0]
-            tmp_masks = paddle.cast(tmp_masks, dtype='bool')
-            labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape))
-            logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape))
-
-        # calculate loss
-        l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion(
-            before_outs, ys)
-        mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
-            before_outs, ys)
-        bce_loss = self.bce_criterion(logits, labels)
-
-        # make weighted mask and apply it
-        if self.use_weighted_masking:
-            masks = make_non_pad_mask(olens).unsqueeze(-1)
-            weights = masks.float() / masks.sum(dim=1, keepdim=True).float()
-            out_weights = weights.div(ys.shape[0] * ys.shape[2])
-            logit_weights = weights.div(ys.shape[0])
-
-            # apply weight
-            l1_loss = l1_loss.multiply(out_weights)
-            l1_loss = l1_loss.masked_select(
-                masks.broadcast_to(l1_loss.shape)).sum()
-            mse_loss = mse_loss.multiply(out_weights)
-            mse_loss = mse_loss.masked_select(
-                masks.broadcast_to(mse_loss.shape)).sum()
-            bce_loss = bce_loss.multiply(logit_weights.squeeze(-1))
-            bce_loss = bce_loss.masked_select(
-                masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum()
-
-        return l1_loss, mse_loss, bce_loss
-
-
-class GuidedAttentionLoss(nn.Layer):
-    """Guided attention loss function module.
-    This module calculates the guided attention loss described
-    in `Efficiently Trainable Text-to-Speech System Based
-    on Deep Convolutional Networks with Guided Attention`_,
-    which forces the attention to be diagonal.
-    .. _`Efficiently Trainable Text-to-Speech System
-        Based on Deep Convolutional Networks with Guided Attention`:
-        https://arxiv.org/abs/1710.08969
-    """
-
-    def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
-        """Initialize guided attention loss module.
-        Parameters
-        ----------
-        sigma : float, optional
-            Standard deviation to control how close attention to a diagonal.
-        alpha : float, optional
-            Scaling coefficient (lambda).
-        reset_always : bool, optional
-            Whether to always reset masks.
-        """
-        super(GuidedAttentionLoss, self).__init__()
-        self.sigma = sigma
-        self.alpha = alpha
-        self.reset_always = reset_always
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def _reset_masks(self):
-        self.guided_attn_masks = None
-        self.masks = None
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of attention weights (B, T_max_out, T_max_in).
-        ilens : LongTensor
-            Batch of input lenghts (B,).
-        olens : LongTensor
-            Batch of output lenghts (B,).
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = self._make_guided_attention_masks(ilens, olens)
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-        return self.alpha * loss
-
-    def _make_guided_attention_masks(self, ilens, olens):
-        n_batches = len(ilens)
-        max_ilen = max(ilens)
-        max_olen = max(olens)
-        guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
-        for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
-            ilen = int(ilen)
-            olen = int(olen)
-            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(
-                ilen, olen, self.sigma)
-        return guided_attn_masks
-
-    @staticmethod
-    def _make_guided_attention_mask(ilen, olen, sigma):
-        """Make guided attention mask.
-        Examples
-        ----------
-        >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
-        >>> guided_attn_mask.shape
-        [5, 5]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
-                [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
-                [0.3935, 0.1175, 0.0000, 0.1175, 0.3935],
-                [0.6753, 0.3935, 0.1175, 0.0000, 0.1175],
-                [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
-        >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
-        >>> guided_attn_mask.shape
-        [6, 3]
-        >>> guided_attn_mask
-        tensor([[0.0000, 0.2934, 0.7506],
-                [0.0831, 0.0831, 0.5422],
-                [0.2934, 0.0000, 0.2934],
-                [0.5422, 0.0831, 0.0831],
-                [0.7506, 0.2934, 0.0000],
-                [0.8858, 0.5422, 0.0831]])
-        """
-        grid_x, grid_y = paddle.meshgrid(
-            paddle.arange(olen), paddle.arange(ilen))
-        grid_x = grid_x.cast(dtype=paddle.float32)
-        grid_y = grid_y.cast(dtype=paddle.float32)
-        return 1.0 - paddle.exp(-(
-            (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))
-
-    @staticmethod
-    def _make_masks(ilens, olens):
-        """Make masks indicating non-padded part.
-        Parameters
-        ----------
-        ilens (LongTensor or List): Batch of lengths (B,).
-        olens (LongTensor or List): Batch of lengths (B,).
-        Returns
-        ----------
-        Tensor
-            Mask tensor indicating non-padded part.
-        Examples
-        ----------
-        >>> ilens, olens = [5, 2], [8, 5]
-        >>> _make_mask(ilens, olens)
-        tensor([[[1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1],
-                 [1, 1, 1, 1, 1]],
-                [[1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [1, 1, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
-        """
-        # (B, T_in)
-        in_masks = make_non_pad_mask(ilens)
-        # (B, T_out)
-        out_masks = make_non_pad_mask(olens)
-        # (B, T_out, T_in)
-        return paddle.logical_and(
-            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))
-
-
-class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
-    """Guided attention loss function module for multi head attention.
-    Parameters
-    ----------
-    sigma : float, optional
-        Standard deviation to control
-        how close attention to a diagonal.
-    alpha : float, optional
-        Scaling coefficient (lambda).
-    reset_always : bool, optional
-        Whether to always reset masks.
-    """
-
-    def forward(self, att_ws, ilens, olens):
-        """Calculate forward propagation.
-        Parameters
-        ----------
-        att_ws : Tensor
-            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
-        ilens : Tensor
-            Batch of input lenghts (B,).
-        olens : Tensor
-            Batch of output lenghts (B,).
-        Returns
-        ----------
-        Tensor
-            Guided attention loss value.
-        """
-        if self.guided_attn_masks is None:
-            self.guided_attn_masks = (
-                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
-        if self.masks is None:
-            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
-        losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(
-            losses.masked_select(self.masks.broadcast_to(losses.shape)))
-        if self.reset_always:
-            self._reset_masks()
-        return self.alpha * loss
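None of the deleted loss classes disappear from the project: the remaining hunks of this commit move them into paddlespeech/t2s/modules/losses.py, and the updater below aliases Tacotron2Loss to the old TransformerTTSLoss name. For reference, the guided attention weight at encoder step t_in and decoder step t_out is 1 - exp(-((t_in/ilen - t_out/olen)^2) / (2*sigma^2)); a quick check that reproduces the first off-diagonal value of the docstring example (sigma = 0.4, ilen = olen = 5):

    import math

    sigma, ilen, olen = 0.4, 5, 5
    t_in, t_out = 1, 0                 # first off-diagonal cell of the 5x5 example
    w = 1.0 - math.exp(-((t_in / ilen - t_out / olen) ** 2) / (2 * sigma ** 2))
    print(round(w, 4))                 # 0.1175, matching the docstring table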
paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py

@@ -17,8 +17,8 @@ from typing import Sequence
 import paddle
 from paddle import distributed as dist
-from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss
-from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss
+from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
+from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss
 from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
 from paddlespeech.t2s.training.reporter import report
 from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
...
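After this change, code outside the model file imports the losses from paddlespeech.t2s.modules.losses; the updater keeps its old local name by aliasing Tacotron2Loss. A hedged usage sketch (the constructor arguments mirror the GuidedAttentionLoss defaults shown above and are illustrative only):

    from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss
    from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss

    # sigma/alpha defaults taken from GuidedAttentionLoss.__init__ in the moved code.
    attn_criterion = GuidedMultiHeadAttentionLoss(sigma=0.4, alpha=1.0)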
@@ -71,7 +71,7 @@ class TransformerTTSUpdater(StandardUpdater):
         self.msg = "Rank: {}, ".format(dist.get_rank())
         losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -116,7 +116,10 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_in, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+        enc_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=batch["text_lengths"] + 1)
         loss = loss + enc_attn_loss
         report("train/enc_attn_loss", float(enc_attn_loss))
         losses_dict["enc_attn_loss"] = float(enc_attn_loss)
...
@@ -133,7 +136,8 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_out, T_out)
         att_ws = paddle.concat(att_ws, axis=1)
-        dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+        dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws, ilens=olens_in, olens=olens_in)
         report("train/dec_attn_loss", float(dec_attn_loss))
         losses_dict["dec_attn_loss"] = float(dec_attn_loss)
         loss = loss + dec_attn_loss
...
@@ -150,7 +154,10 @@ class TransformerTTSUpdater(StandardUpdater):
                 break
         # (B, H*L, T_out, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+        enc_dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=olens_in)
         report("train/enc_dec_attn_loss", float(enc_dec_attn_loss))
         losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
         loss = loss + enc_dec_attn_loss
...
@@ -215,7 +222,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
     def evaluate_core(self, batch):
         self.msg = "Evaluate: "
         losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model(
+        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
             text=batch["text"],
             text_lengths=batch["text_lengths"],
             speech=batch["speech"],
...
@@ -260,7 +267,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_in, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens)
+        enc_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=batch["text_lengths"] + 1)
         loss = loss + enc_attn_loss
         report("train/enc_attn_loss", float(enc_attn_loss))
         losses_dict["enc_attn_loss"] = float(enc_attn_loss)
...
@@ -277,7 +287,8 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_out, T_out)
         att_ws = paddle.concat(att_ws, axis=1)
-        dec_attn_loss = self.attn_criterion(att_ws, olens, olens)
+        dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws, ilens=olens_in, olens=olens_in)
         report("eval/dec_attn_loss", float(dec_attn_loss))
         losses_dict["dec_attn_loss"] = float(dec_attn_loss)
         loss = loss + dec_attn_loss
...
@@ -295,7 +306,10 @@ class TransformerTTSEvaluator(StandardEvaluator):
                 break
         # (B, H*L, T_out, T_in)
         att_ws = paddle.concat(att_ws, axis=1)
-        enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens)
+        enc_dec_attn_loss = self.attn_criterion(
+            att_ws=att_ws,
+            ilens=batch["text_lengths"] + 1,
+            olens=olens_in)
         report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss))
         losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss)
         loss = loss + enc_dec_attn_loss
...
paddlespeech/t2s/modules/losses.py

@@ -26,26 +26,30 @@ from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
 # Loss for new Tacotron2
 class GuidedAttentionLoss(nn.Layer):
     """Guided attention loss function module.
     This module calculates the guided attention loss described
     in `Efficiently Trainable Text-to-Speech System Based
     on Deep Convolutional Networks with Guided Attention`_,
     which forces the attention to be diagonal.
     .. _`Efficiently Trainable Text-to-Speech System
         Based on Deep Convolutional Networks with Guided Attention`:
         https://arxiv.org/abs/1710.08969
     """

     def __init__(self, sigma=0.4, alpha=1.0, reset_always=True):
         """Initialize guided attention loss module.
         Parameters
         ----------
         sigma : float, optional
-            Standard deviation to control
-            how close attention to a diagonal.
+            Standard deviation to control how close attention to a diagonal.
         alpha : float, optional
             Scaling coefficient (lambda).
         reset_always : bool, optional
             Whether to always reset masks.
         """
         super().__init__()
         self.sigma = sigma
...
@@ -60,18 +64,21 @@ class GuidedAttentionLoss(nn.Layer):
     def forward(self, att_ws, ilens, olens):
         """Calculate forward propagation.
         Parameters
         ----------
         att_ws : Tensor
             Batch of attention weights (B, T_max_out, T_max_in).
         ilens : Tensor(int64)
-            Batch of input lengths (B,).
+            Batch of input lenghts (B,).
         olens : Tensor(int64)
-            Batch of output lengths (B,).
+            Batch of output lenghts (B,).
         Returns
         ----------
         Tensor
             Guided attention loss value.
         """
         if self.guided_attn_masks is None:
             self.guided_attn_masks = self._make_guided_attention_masks(ilens,
...
@@ -79,7 +86,8 @@ class GuidedAttentionLoss(nn.Layer):
         if self.masks is None:
             self.masks = self._make_masks(ilens, olens)
         losses = self.guided_attn_masks * att_ws
-        loss = paddle.mean(losses.masked_select(self.masks))
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
         if self.reset_always:
             self._reset_masks()
         return self.alpha * loss
...
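Broadcasting the mask to losses.shape before masked_select makes the shape match explicit instead of relying on the stored mask already having the same shape as the weighted attention map. A self-contained sketch of the pattern with toy tensors (not the library's own helpers or shapes):

    import paddle

    losses = paddle.rand([2, 6, 4])                      # e.g. guided_attn_masks * att_ws
    olens = paddle.to_tensor([6, 4])                     # valid decoder steps per sample
    steps = paddle.arange(6).unsqueeze(0)                # (1, T_out)
    masks = (steps < olens.unsqueeze(1)).unsqueeze(-1)   # (B, T_out, 1), bool
    loss = paddle.mean(losses.masked_select(masks.broadcast_to(losses.shape)))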
@@ -89,6 +97,7 @@ class GuidedAttentionLoss(nn.Layer):
         max_ilen = max(ilens)
         max_olen = max(olens)
         guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen))
         for idx, (ilen, olen) in enumerate(zip(ilens, olens)):
             guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(
...
@@ -98,11 +107,12 @@ class GuidedAttentionLoss(nn.Layer):
     @staticmethod
     def _make_guided_attention_mask(ilen, olen, sigma):
         """Make guided attention mask.
-        Parameters
+        Examples
         ----------
         >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4)
         >>> guided_attn_mask.shape
-        Size([5, 5])
+        [5, 5]
         >>> guided_attn_mask
         tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647],
                 [0.1175, 0.0000, 0.1175, 0.3935, 0.6753],
...
@@ -111,7 +121,7 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]])
         >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4)
         >>> guided_attn_mask.shape
-        Size([6, 3])
+        [6, 3]
         >>> guided_attn_mask
         tensor([[0.0000, 0.2934, 0.7506],
                 [0.0831, 0.0831, 0.5422],
...
@@ -119,55 +129,109 @@ class GuidedAttentionLoss(nn.Layer):
                 [0.5422, 0.0831, 0.0831],
                 [0.7506, 0.2934, 0.0000],
                 [0.8858, 0.5422, 0.0831]])
         """
         grid_x, grid_y = paddle.meshgrid(
             paddle.arange(olen), paddle.arange(ilen))
-        grid_x = paddle.cast(grid_x, dtype='float32')
-        grid_y = paddle.cast(grid_y, dtype='float32')
+        grid_x = grid_x.cast(dtype=paddle.float32)
+        grid_y = grid_y.cast(dtype=paddle.float32)
         return 1.0 - paddle.exp(-(
             (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2)))

     @staticmethod
     def _make_masks(ilens, olens):
         """Make masks indicating non-padded part.
-        Examples
+        Parameters
         ----------
+        ilens : Tensor(int64) or List
+            Batch of lengths (B,).
+        olens : Tensor(int64) or List
+            Batch of lengths (B,).
+        Returns
+        ----------
+        Tensor
+            Mask tensor indicating non-padded part.
+        Examples
+        ----------
         >>> ilens, olens = [5, 2], [8, 5]
         >>> _make_mask(ilens, olens)
         tensor([[[1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1],
                  [1, 1, 1, 1, 1]],
                 [[1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [1, 1, 0, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0],
-                 [0, 0, 0, 0, 0]]],)
+                 [0, 0, 0, 0, 0]]], dtype=paddle.uint8)
         """
         # (B, T_in)
         in_masks = make_non_pad_mask(ilens)
         # (B, T_out)
         out_masks = make_non_pad_mask(olens)
         # (B, T_out, T_in)
-        return out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
+        return paddle.logical_and(
+            out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))


+class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss):
+    """Guided attention loss function module for multi head attention.
+    Parameters
+    ----------
+    sigma : float, optional
+        Standard deviation to control
+        how close attention to a diagonal.
+    alpha : float, optional
+        Scaling coefficient (lambda).
+    reset_always : bool, optional
+        Whether to always reset masks.
+    """
+
+    def forward(self, att_ws, ilens, olens):
+        """Calculate forward propagation.
+        Parameters
+        ----------
+        att_ws : Tensor
+            Batch of multi head attention weights (B, H, T_max_out, T_max_in).
+        ilens : Tensor
+            Batch of input lenghts (B,).
+        olens : Tensor
+            Batch of output lenghts (B,).
+        Returns
+        ----------
+        Tensor
+            Guided attention loss value.
+        """
+        if self.guided_attn_masks is None:
+            self.guided_attn_masks = (
+                self._make_guided_attention_masks(ilens, olens).unsqueeze(1))
+        if self.masks is None:
+            self.masks = self._make_masks(ilens, olens).unsqueeze(1)
+        losses = self.guided_attn_masks * att_ws
+        loss = paddle.mean(
+            losses.masked_select(self.masks.broadcast_to(losses.shape)))
+        if self.reset_always:
+            self._reset_masks()
+        return self.alpha * loss
+
+
 class Tacotron2Loss(nn.Layer):
...
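In _make_masks the bitwise & is replaced by an explicit paddle.logical_and; the result is still the outer AND of the input-side and output-side non-pad masks. A self-contained sketch that reproduces the docstring example without the library's make_non_pad_mask helper (the arange comparison below stands in for it, as an assumption about its behaviour):

    import paddle

    ilens = paddle.to_tensor([5, 2])
    olens = paddle.to_tensor([8, 5])

    # Stand-in for make_non_pad_mask: True where the step index is below the length.
    in_masks = paddle.arange(int(ilens.max())).unsqueeze(0) < ilens.unsqueeze(1)   # (B, T_in)
    out_masks = paddle.arange(int(olens.max())).unsqueeze(0) < olens.unsqueeze(1)  # (B, T_out)

    # (B, T_out, T_in); equivalent to out_masks.unsqueeze(-1) & in_masks.unsqueeze(-2)
    masks = paddle.logical_and(out_masks.unsqueeze(-1), in_masks.unsqueeze(-2))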