fix yamls, change labels to stop_labels, test=tts

96323816 · 小湉湉 · 1bf1a876 · 96323816 · 96323816 · 96323816
14 changed files
--- a/examples/aishell3/tts3/conf/default.yaml
+++ b/examples/aishell3/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/examples/aishell3/vc1/conf/default.yaml
+++ b/examples/aishell3/vc1/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/examples/csmsc/tts0/conf/default.yaml
+++ b/examples/csmsc/tts0/conf/default.yaml
@@ -21,10 +21,6 @@ fmin: 80           # Minimum frequency of Mel basis.
 fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

-# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
-
 ###########################################################
 #                       DATA SETTING                      #
 ###########################################################

--- a/examples/csmsc/tts3/conf/conformer.yaml
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/examples/csmsc/tts3/conf/default.yaml
+++ b/examples/csmsc/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/examples/ljspeech/tts3/conf/default.yaml
+++ b/examples/ljspeech/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/examples/vctk/tts3/conf/default.yaml
+++ b/examples/vctk/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.

 # Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80          # Maximum f0 for pitch extraction.
-f0max: 400         # Minimum f0 for pitch extraction.
+f0min: 80          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.


 ###########################################################

--- a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py
@@ -27,9 +27,7 @@ import tqdm
 import yaml
 from yacs.config import CfgNode

-from paddlespeech.t2s.data.get_feats import Energy
 from paddlespeech.t2s.data.get_feats import LogMelFBank
-from paddlespeech.t2s.data.get_feats import Pitch
 from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
 from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
 from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
@@ -42,8 +40,6 @@ def process_sentence(config: Dict[str, Any],
                     sentences: Dict,
                     output_dir: Path,
                     mel_extractor=None,
-                     pitch_extractor=None,
-                     energy_extractor=None,
                     cut_sil: bool=True,
                     spk_emb_dir: Path=None):
    utt_id = fp.stem
@@ -117,8 +113,6 @@ def process_sentences(config,
                      sentences: Dict,
                      output_dir: Path,
                      mel_extractor=None,
-                      pitch_extractor=None,
-                      energy_extractor=None,
                      nprocs: int=1,
                      cut_sil: bool=True,
                      spk_emb_dir: Path=None):
@@ -126,8 +120,7 @@ def process_sentences(config,
        results = []
        for fp in fps:
            record = process_sentence(config, fp, sentences, output_dir,
-                                      mel_extractor, pitch_extractor,
-                                      energy_extractor, cut_sil, spk_emb_dir)
+                                      mel_extractor, cut_sil, spk_emb_dir)
            if record:
                results.append(record)
    else:
@@ -137,7 +130,6 @@ def process_sentences(config,
                for fp in fps:
                    future = pool.submit(process_sentence, config, fp,
                                         sentences, output_dir, mel_extractor,
-                                         pitch_extractor, energy_extractor,
                                         cut_sil, spk_emb_dir)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)
@@ -299,17 +291,6 @@ def main():
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)
-    pitch_extractor = Pitch(
-        sr=config.fs,
-        hop_length=config.n_shift,
-        f0min=config.f0min,
-        f0max=config.f0max)
-    energy_extractor = Energy(
-        sr=config.fs,
-        n_fft=config.n_fft,
-        hop_length=config.n_shift,
-        win_length=config.win_length,
-        window=config.window)

    # process for the 3 sections
    if train_wav_files:
@@ -319,8 +300,6 @@ def main():
            sentences,
            train_dump_dir,
            mel_extractor,
-            pitch_extractor,
-            energy_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)
@@ -331,8 +310,6 @@ def main():
            sentences,
            dev_dump_dir,
            mel_extractor,
-            pitch_extractor,
-            energy_extractor,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)
    if test_wav_files:
@@ -342,8 +319,6 @@ def main():
            sentences,
            test_dump_dir,
            mel_extractor,
-            pitch_extractor,
-            energy_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            spk_emb_dir=spk_emb_dir)

--- a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py
@@ -300,10 +300,10 @@ class Tacotron2(nn.Layer):
        olens = speech_lengths

        # make labels for stop prediction
-        labels = make_pad_mask(olens - 1)
+        stop_labels = make_pad_mask(olens - 1)
        # bool 类型无法切片
-        labels = paddle.cast(labels, dtype='float32')
-        labels = F.pad(labels, [0, 0, 0, 1], "constant", 1.0)
+        stop_labels = paddle.cast(stop_labels, dtype='float32')
+        stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)

        # calculate tacotron2 outputs
        after_outs, before_outs, logits, att_ws = self._forward(
@@ -322,12 +322,13 @@ class Tacotron2(nn.Layer):
            olens = olens - olens % self.reduction_factor
            max_out = max(olens)
            ys = ys[:, :max_out]
-            labels = labels[:, :max_out]
-            labels = paddle.scatter(labels, 1, (olens - 1).unsqueeze(1), 1.0)
+            stop_labels = stop_labels[:, :max_out]
+            stop_labels = paddle.scatter(stop_labels, 1,
+                                         (olens - 1).unsqueeze(1), 1.0)
            olens_in = olens // self.reduction_factor
        else:
            olens_in = olens
-        return after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in
+        return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in

    def _forward(
            self,

--- a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
+++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py
@@ -74,7 +74,7 @@ class Tacotron2Updater(StandardUpdater):
        if spk_emb is not None:
            spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
+        after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
@@ -83,8 +83,13 @@ class Tacotron2Updater(StandardUpdater):
            spk_emb=spk_emb)

        # calculate taco2 loss
-        l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
-                                                      logits, ys, labels, olens)
+        l1_loss, mse_loss, bce_loss = self.taco2_loss(
+            after_outs=after_outs,
+            before_outs=before_outs,
+            logits=logits,
+            ys=ys,
+            stop_labels=stop_labels,
+            olens=olens)

        if self.loss_type == "L1+L2":
            loss = l1_loss + mse_loss + bce_loss
@@ -164,7 +169,7 @@ class Tacotron2Evaluator(StandardEvaluator):
        if spk_emb is not None:
            spk_id = None

-        after_outs, before_outs, logits, ys, labels, olens, att_ws, olens_in = self.model(
+        after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
@@ -173,8 +178,13 @@ class Tacotron2Evaluator(StandardEvaluator):
            spk_emb=spk_emb)

        # calculate taco2 loss
-        l1_loss, mse_loss, bce_loss = self.taco2_loss(after_outs, before_outs,
-                                                      logits, ys, labels, olens)
+        l1_loss, mse_loss, bce_loss = self.taco2_loss(
+            after_outs=after_outs,
+            before_outs=before_outs,
+            logits=logits,
+            ys=ys,
+            stop_labels=stop_labels,
+            olens=olens)

        if self.loss_type == "L1+L2":
            loss = l1_loss + mse_loss + bce_loss

--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
@@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer):
        olens = paddle.cast(speech_lengths, 'int64')

        # make labels for stop prediction
-        labels = make_pad_mask(olens - 1)
-        labels = numpy.pad(
-            labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0)
-        labels = paddle.to_tensor(labels)
-        labels = paddle.cast(labels, dtype="float32")
-        # labels = F.pad(labels, [0, 1], "constant", 1.0)
+        stop_labels = make_pad_mask(olens - 1)
+        # bool tensors cannot be sliced, so cast to float32 first
+        stop_labels = paddle.cast(stop_labels, dtype='float32')
+        stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)

        # calculate transformer outputs
        after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens,
@@ -450,8 +448,8 @@ class TransformerTTS(nn.Layer):
            olens = olens - olens % self.reduction_factor
            max_olen = max(olens)
            ys = ys[:, :max_olen]
-            labels = labels[:, :max_olen]
-            labels[:, -1] = 1.0  # make sure at least one frame has 1
+            stop_labels = stop_labels[:, :max_olen]
+            stop_labels[:, -1] = 1.0  # make sure at least one frame has 1
            olens_in = olens // self.reduction_factor
        else:
            olens_in = olens
@@ -465,7 +463,7 @@ class TransformerTTS(nn.Layer):
            'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn
        need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc

-        return after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict
+        return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict

    def _forward(
            self,

--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
@@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater):
        self.msg = "Rank: {}, ".format(dist.get_rank())
        losses_dict = {}

-        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
+        after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
@@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater):
            before_outs=before_outs,
            logits=logits,
            ys=ys,
-            labels=labels,
+            stop_labels=stop_labels,
            olens=olens)

        report("train/bce_loss", float(bce_loss))
@@ -226,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
    def evaluate_core(self, batch):
        self.msg = "Evaluate: "
        losses_dict = {}
-        after_outs, before_outs, logits, ys, labels, olens, olens_in, need_dict = self.model(
+        after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model(
            text=batch["text"],
            text_lengths=batch["text_lengths"],
            speech=batch["speech"],
@@ -237,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator):
            before_outs=before_outs,
            logits=logits,
            ys=ys,
-            labels=labels,
+            stop_labels=stop_labels,
            olens=olens)

        report("eval/bce_loss", float(bce_loss))

--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@@ -263,7 +263,7 @@ class Tacotron2Loss(nn.Layer):
        self.bce_criterion = nn.BCEWithLogitsLoss(
            reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight))

-    def forward(self, after_outs, before_outs, logits, ys, labels, olens):
+    def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens):
        """Calculate forward propagation.
        Parameters
        ----------
@@ -275,7 +275,7 @@ class Tacotron2Loss(nn.Layer):
            Batch of stop logits (B, Lmax).
        ys : Tensor
            Batch of padded target features (B, Lmax, odim).
-        labels : Tensor(int64)
+        stop_labels : Tensor(float32)
            Batch of the sequences of stop token labels (B, Lmax).
        olens : Tensor(int64)
            Batch of the lengths of each target (B,).
@@ -296,8 +296,8 @@ class Tacotron2Loss(nn.Layer):
                masks.broadcast_to(after_outs.shape))
            before_outs = before_outs.masked_select(
                masks.broadcast_to(before_outs.shape))
-            labels = labels.masked_select(
-                masks[:, :, 0].broadcast_to(labels.shape))
+            stop_labels = stop_labels.masked_select(
+                masks[:, :, 0].broadcast_to(stop_labels.shape))
            logits = logits.masked_select(
                masks[:, :, 0].broadcast_to(logits.shape))

@@ -306,7 +306,7 @@ class Tacotron2Loss(nn.Layer):
            before_outs, ys)
        mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion(
            before_outs, ys)
-        bce_loss = self.bce_criterion(logits, labels)
+        bce_loss = self.bce_criterion(logits, stop_labels)

        # make weighted mask and apply it
        if self.use_weighted_masking:

--- a/paddlespeech/t2s/modules/tacotron2/attentions.py
+++ b/paddlespeech/t2s/modules/tacotron2/attentions.py
@@ -207,7 +207,7 @@ class AttLoc(nn.Layer):

        w = F.softmax(scaling * e, axis=1)

-        # weighted sum over flames
+        # weighted sum over frames
        # utt x hdim
        c = paddle.sum(
            self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1)