PaddlePaddle / DeepSpeech · Commit 9f054e5a (unverified)
Authored on Nov 16, 2021 by xiegegege; committed by GitHub on Nov 16, 2021.
Parent: 26258949

Revert "[TTS]add multi-band melgan finetune scripts"

Showing 7 changed files with 123 additions and 584 deletions (+123 -584)
demos/style_fs2/style_syn.py                          +119    -1
examples/csmsc/voc3/conf/finetune.yaml                  +0  -139
examples/csmsc/voc3/finetune.sh                         +0   -63
examples/csmsc/voc3/local/link_wav.py                   +0   -85
paddlespeech/t2s/datasets/vocoder_batch_fn.py           +4    -4
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py        +0  -167
paddlespeech/t2s/models/fastspeech2/fastspeech2.py      +0  -125
demos/style_fs2/style_syn.py (modified, +119 -1)

@@ -13,6 +13,7 @@
 # limitations under the License.
 import argparse
 from pathlib import Path
+from typing import Union
 
 import numpy as np
 import paddle

@@ -22,12 +23,129 @@ from yacs.config import CfgNode
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
-from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
 from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
 from paddlespeech.t2s.models.parallel_wavegan import PWGInference
 from paddlespeech.t2s.modules.normalizer import ZScore
+
+
+class StyleFastSpeech2Inference(FastSpeech2Inference):
+    def __init__(self, normalizer, model, pitch_stats_path, energy_stats_path):
+        super().__init__(normalizer, model)
+        pitch_mean, pitch_std = np.load(pitch_stats_path)
+        self.pitch_mean = paddle.to_tensor(pitch_mean)
+        self.pitch_std = paddle.to_tensor(pitch_std)
+        energy_mean, energy_std = np.load(energy_stats_path)
+        self.energy_mean = paddle.to_tensor(energy_mean)
+        self.energy_std = paddle.to_tensor(energy_std)
+
+    def denorm(self, data, mean, std):
+        return data * std + mean
+
+    def norm(self, data, mean, std):
+        return (data - mean) / std
+
+    def forward(self,
+                text: paddle.Tensor,
+                durations: Union[paddle.Tensor, np.ndarray]=None,
+                durations_scale: Union[int, float]=None,
+                durations_bias: Union[int, float]=None,
+                pitch: Union[paddle.Tensor, np.ndarray]=None,
+                pitch_scale: Union[int, float]=None,
+                pitch_bias: Union[int, float]=None,
+                energy: Union[paddle.Tensor, np.ndarray]=None,
+                energy_scale: Union[int, float]=None,
+                energy_bias: Union[int, float]=None,
+                robot: bool=False):
+        """
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T,).
+        durations : paddle.Tensor/np.ndarray, optional (int64)
+            Ground truth of duration (T,); overrides durations_scale and durations_bias.
+        durations_scale : int/float, optional
+        durations_bias : int/float, optional
+        pitch : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged pitch (T, 1); overrides pitch_scale and pitch_bias.
+        pitch_scale : int/float, optional
+            In denormed Hz domain.
+        pitch_bias : int/float, optional
+            In denormed Hz domain.
+        energy : paddle.Tensor/np.ndarray, optional
+            Ground truth of token-averaged energy (T, 1); overrides energy_scale and energy_bias.
+        energy_scale : int/float, optional
+            In denormed domain.
+        energy_bias : int/float, optional
+            In denormed domain.
+        robot : bool, optional
+            Whether to output robot style.
+        Returns
+        ----------
+        Tensor
+            Output sequence of features (L, odim).
+        """
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text, durations=None, pitch=None, energy=None)
+        # priority: ground truth > scale/bias > previous output
+        # set durations
+        if isinstance(durations, np.ndarray):
+            durations = paddle.to_tensor(durations)
+        elif isinstance(durations, paddle.Tensor):
+            durations = durations
+        elif durations_scale or durations_bias:
+            durations_scale = durations_scale if durations_scale is not None else 1
+            durations_bias = durations_bias if durations_bias is not None else 0
+            durations = durations_scale * d_outs + durations_bias
+        else:
+            durations = d_outs
+        if robot:
+            # setting normed pitch to zeros has the same effect as setting denormed pitch to the mean
+            pitch = paddle.zeros(p_outs.shape)
+        # set pitch; this can overwrite the robot setting
+        if isinstance(pitch, np.ndarray):
+            pitch = paddle.to_tensor(pitch)
+        elif isinstance(pitch, paddle.Tensor):
+            pitch = pitch
+        elif pitch_scale or pitch_bias:
+            pitch_scale = pitch_scale if pitch_scale is not None else 1
+            pitch_bias = pitch_bias if pitch_bias is not None else 0
+            p_Hz = paddle.exp(
+                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
+            p_Hz = pitch_scale * p_Hz + pitch_bias
+            pitch = self.norm(
+                paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
+        else:
+            pitch = p_outs
+        # set energy
+        if isinstance(energy, np.ndarray):
+            energy = paddle.to_tensor(energy)
+        elif isinstance(energy, paddle.Tensor):
+            energy = energy
+        elif energy_scale or energy_bias:
+            energy_scale = energy_scale if energy_scale is not None else 1
+            energy_bias = energy_bias if energy_bias is not None else 0
+            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
+            e_dnorm = energy_scale * e_dnorm + energy_bias
+            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
+        else:
+            energy = e_outs
+
+        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
+            text,
+            durations=durations,
+            pitch=pitch,
+            energy=energy,
+            use_teacher_forcing=True)
+        logmel = self.normalizer.inverse(normalized_mel)
+        return logmel
+
+
 def evaluate(args, fastspeech2_config, pwg_config):
     # construct dataset for evaluation
 ...
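For orientation, here is a hypothetical usage sketch of the class re-added above. The `normalizer` and `model` objects and all file names are assumptions; the real wiring (frontend, checkpoint loading, vocoder) lives in the rest of demos/style_fs2/style_syn.py.

import numpy as np
import paddle

# `normalizer` and `model` are assumed to be built as in evaluate(); the stats
# files are hypothetical (mean, std) dumps of log-F0 and energy.
inference = StyleFastSpeech2Inference(
    normalizer, model, "pitch_stats.npy", "energy_stats.npy")

phone_ids = paddle.to_tensor(np.array([12, 35, 7, 41], dtype="int64"))  # toy input
with paddle.no_grad():
    # 1.2x longer durations (slower speech) and pitch raised by 40 Hz;
    # passing ground-truth tensors instead would override these knobs
    mel = inference(phone_ids, durations_scale=1.2, pitch_bias=40)
    robot_mel = inference(phone_ids, robot=True)  # monotone (mean) pitch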
examples/csmsc/voc3/conf/finetune.yaml (deleted, mode 100644 → 0, -139)

# This is the hyperparameter configuration file for MelGAN.
# Please make sure this is adjusted for the CSMSC dataset. If you want to
# apply it to another dataset, you might need to carefully change some parameters.
# This configuration requires ~8GB memory and will finish within 7 days on a Titan V.

# This configuration is based on full-band MelGAN, but the hop size and sampling
# rate differ from the paper (16kHz vs 24kHz). The number of iterations
# is not given in the paper, so we currently train for 1M iterations (possibly not
# enough to converge). The optimizer setting is based on @dathudeptrai's advice.
# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906

###########################################################
#                FEATURE EXTRACTION SETTING                #
###########################################################
fs: 24000            # Sampling rate.
n_fft: 2048          # FFT size (in samples).
n_shift: 300         # Hop size (in samples).
win_length: 1200     # Window length (in samples).
                     # If set to null, it will be the same as fft_size.
window: "hann"       # Window function.
n_mels: 80           # Number of mel basis.
fmin: 80             # Minimum frequency in mel basis calculation (Hz).
fmax: 7600           # Maximum frequency in mel basis calculation (Hz).

###########################################################
#          GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_params:
    in_channels: 80               # Number of input channels.
    out_channels: 4               # Number of output channels.
    kernel_size: 7                # Kernel size of initial and final conv layers.
    channels: 384                 # Initial number of channels for conv layers.
    upsample_scales: [5, 5, 3]    # List of upsampling scales.
    stack_kernel_size: 3          # Kernel size of dilated conv layers in residual stack.
    stacks: 4                     # Number of stacks in a single residual stack module.
    use_weight_norm: True         # Whether to use weight normalization.
    use_causal_conv: False        # Whether to use causal convolution.
    use_final_nonlinear_activation: True

###########################################################
#        DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_params:
    in_channels: 1                    # Number of input channels.
    out_channels: 1                   # Number of output channels.
    scales: 3                         # Number of multi-scales.
    downsample_pooling: "AvgPool1D"   # Pooling type for the input downsampling.
    downsample_pooling_params:        # Parameters of the above pooling function.
        kernel_size: 4
        stride: 2
        padding: 1
        exclusive: True
    kernel_sizes: [5, 3]              # List of kernel sizes.
    channels: 16                      # Number of channels of the initial conv layer.
    max_downsample_channels: 512      # Maximum number of channels of downsampling layers.
    downsample_scales: [4, 4, 4]      # List of downsampling scales.
    nonlinear_activation: "LeakyReLU" # Nonlinear activation function.
    nonlinear_activation_params:      # Parameters of nonlinear activation function.
        negative_slope: 0.2
    use_weight_norm: True             # Whether to use weight norm.

###########################################################
#                    STFT LOSS SETTING                     #
###########################################################
use_stft_loss: true
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT sizes for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop sizes for STFT-based loss.
    win_lengths: [600, 1200, 240] # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.
use_subband_stft_loss: true
subband_stft_loss_params:
    fft_sizes: [384, 683, 171]    # List of FFT sizes for STFT-based loss.
    hop_sizes: [30, 60, 10]       # List of hop sizes for STFT-based loss.
    win_lengths: [150, 300, 60]   # List of window lengths for STFT-based loss.
    window: "hann"                # Window function for STFT-based loss.

###########################################################
#                ADVERSARIAL LOSS SETTING                  #
###########################################################
use_feat_match_loss: false # Whether to use feature matching loss.
lambda_adv: 2.5            # Loss balancing coefficient for adversarial loss.

###########################################################
#                   DATA LOADER SETTING                    #
###########################################################
batch_size: 64         # Batch size.
batch_max_steps: 16200 # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 2         # Number of workers in DataLoader.

###########################################################
#              OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_params:
    epsilon: 1.0e-7         # Generator's epsilon.
    weight_decay: 0.0       # Generator's weight decay coefficient.
generator_grad_norm: -1     # Generator's gradient norm.
generator_scheduler_params:
    learning_rate: 1.0e-3   # Generator's learning rate.
    gamma: 0.5              # Generator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000
discriminator_optimizer_params:
    epsilon: 1.0e-7         # Discriminator's epsilon.
    weight_decay: 0.0       # Discriminator's weight decay coefficient.
discriminator_grad_norm: -1 # Discriminator's gradient norm.
discriminator_scheduler_params:
    learning_rate: 1.0e-3   # Discriminator's learning rate.
    gamma: 0.5              # Discriminator's scheduler gamma.
    milestones:             # At each milestone, lr will be multiplied by gamma.
        - 100000
        - 200000
        - 300000
        - 400000
        - 500000
        - 600000

###########################################################
#                     INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 200000 # Step at which discriminator training starts.
train_max_steps: 1200000                # Number of training steps.
save_interval_steps: 1000               # Interval steps to save checkpoint.
eval_interval_steps: 1000               # Interval steps to evaluate the network.

###########################################################
#                      OTHER SETTING                       #
###########################################################
num_snapshots: 10 # Max number of snapshots to keep while training.
seed: 42          # Random seed for paddle, random, and np.random.
\ No newline at end of file
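For reference, the repo's entry points load such configs via yacs, the same way gen_gta_mel.py below does; a minimal sketch (the path assumes the pre-revert file above):

import yaml
from yacs.config import CfgNode

# Load the finetune config the way the voc3 training scripts do.
with open("conf/finetune.yaml") as f:
    config = CfgNode(yaml.safe_load(f))

print(config.fs)       # 24000
print(config.n_shift)  # 300
# Note: the generator upsamples by 5 * 5 * 3 = 75 per sub-band; with
# out_channels = 4 sub-bands this matches n_shift = 300 after synthesis.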
examples/csmsc/voc3/finetune.sh (deleted, mode 100755 → 0, -63)

#!/bin/bash

source path.sh

gpus=0
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
        --dur-file=durations.txt \
        --output-dir=dump_finetune \
        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 local/link_wav.py \
        --old-dump-dir=dump \
        --dump-dir=dump_finetune
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    cp dump/train/feats_stats.npy dump_finetune/train/
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/train/raw/metadata.jsonl \
        --dumpdir=dump_finetune/train/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/dev/raw/metadata.jsonl \
        --dumpdir=dump_finetune/dev/norm \
        --stats=dump_finetune/train/feats_stats.npy
    python3 ${BIN_DIR}/../normalize.py \
        --metadata=dump_finetune/test/raw/metadata.jsonl \
        --dumpdir=dump_finetune/test/norm \
        --stats=dump_finetune/train/feats_stats.npy
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} \
    FLAGS_cudnn_exhaustive_search=true \
    FLAGS_conv_workspace_size_limit=4000 \
    python ${BIN_DIR}/train.py \
        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
        --config=conf/finetune.yaml \
        --output-dir=exp/finetune \
        --ngpu=1
fi
\ No newline at end of file
examples/csmsc/voc3/local/link_wav.py (deleted, mode 100644 → 0, -85)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")
    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to dump finetune feature files.")
    args = parser.parse_args()
    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()

    for sub in ["train", "dev", "test"]:
        # symlink the *_wave.npy files in old_dump_dir to the corresponding
        # locations under dump_dir
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # e.g. 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)

            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()
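The records written above follow the dump metadata convention used by the voc3 recipes; a small sketch of reading one back (the path is hypothetical):

import jsonlines

with jsonlines.open("dump_finetune/train/raw/metadata.jsonl") as reader:
    for record in reader:
        # each record pairs a generated mel ("feats") with the original wave dump
        print(record["utt_id"], record["num_frames"], record["wave"])
        break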
paddlespeech/t2s/datasets/vocoder_batch_fn.py (modified, +4 -4)

@@ -110,10 +110,10 @@ class Clip(object):
         if len(x) < c.shape[0] * self.hop_size:
             x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
         elif len(x) > c.shape[0] * self.hop_size:
-            # print(
-            #     f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
-            # )
-            x = x[:c.shape[0] * self.hop_size]
+            print(
+                f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size})"
+            )
+            x = x[:c.shape[1] * self.hop_size]
 
         # check the length is valid
         assert len(x) == c.shape[ ...
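The hunk above implements the usual wave/mel alignment: the wave is edge-padded or trimmed until len(x) == num_frames * hop_size. A self-contained numpy sketch of that invariant (all values hypothetical):

import numpy as np

hop_size = 300
c = np.zeros((10, 80))     # 10 mel frames, 80 mel bins
x = np.random.randn(2980)  # wave slightly shorter than 10 * 300 samples

if len(x) < c.shape[0] * hop_size:
    # repeat the last sample until the wave covers every frame
    x = np.pad(x, (0, c.shape[0] * hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * hop_size:
    x = x[:c.shape[0] * hop_size]  # drop trailing samples

assert len(x) == c.shape[0] * hop_size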
paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py (deleted, mode 100644 → 0, -167)

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
# what if the generated length is inconsistent with the original mel?
import argparse
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import StyleFastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore


def evaluate(args, fastspeech2_config):
    # construct dataset for evaluation
    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    phone_dict = {}
    for phn, id in phn_id:
        phone_dict[phn] = int(id)

    odim = fastspeech2_config.n_mels
    model = FastSpeech2(
        idim=vocab_size, odim=odim, **fastspeech2_config["model"])

    model.set_state_dict(
        paddle.load(args.fastspeech2_checkpoint)["main_params"])
    model.eval()

    stat = np.load(args.fastspeech2_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    fastspeech2_normalizer = ZScore(mu, std)

    fastspeech2_inference = StyleFastSpeech2Inference(fastspeech2_normalizer,
                                                      model)
    fastspeech2_inference.eval()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    sentences, speaker_set = get_phn_dur(args.dur_file)
    merge_silence(sentences)

    for i, utt_id in enumerate(sentences):
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        # trim leading and trailing sil
        if args.cut_sil:
            if phones[0] == "sil" and len(durations) > 1:
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                durations = durations[:-1]
                phones = phones[:-1]
            # sentences[utt_id][0] = phones
            # sentences[utt_id][1] = durations

        phone_ids = [phone_dict[phn] for phn in phones]
        phone_ids = paddle.to_tensor(np.array(phone_ids))
        durations = paddle.to_tensor(np.array(durations))
        # the generated mel may differ from the real one by 1 or 2 frames,
        # but batch_fn will fix that

        # split data into 3 sections
        if args.dataset == "baker":
            num_train = 9800
            num_dev = 100
        if i in range(0, num_train):
            sub_output_dir = output_dir / ("train/raw")
        elif i in range(num_train, num_train + num_dev):
            sub_output_dir = output_dir / ("dev/raw")
        else:
            sub_output_dir = output_dir / ("test/raw")
        sub_output_dir.mkdir(parents=True, exist_ok=True)

        with paddle.no_grad():
            mel = fastspeech2_inference(phone_ids, durations=durations)
        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)


def main():
    # parse args and config and redirect to train_sp
    parser = argparse.ArgumentParser(
        description="Synthesize with fastspeech2 & parallel wavegan.")
    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker, ljspeech, vctk} now")
    parser.add_argument(
        "--fastspeech2-config", type=str, help="fastspeech2 config file.")
    parser.add_argument(
        "--fastspeech2-checkpoint",
        type=str,
        help="fastspeech2 checkpoint to load.")
    parser.add_argument(
        "--fastspeech2-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training fastspeech2."
    )
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phone_id_map.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--dur-file", default=None, type=str, help="path to durations.txt.")
    parser.add_argument("--output-dir", type=str, help="output dir.")
    parser.add_argument(
        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose.")

    def str2bool(str):
        return True if str.lower() == 'true' else False

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut sil at the edges of the audio")
    args = parser.parse_args()

    if args.ngpu == 0:
        paddle.set_device("cpu")
    elif args.ngpu > 0:
        paddle.set_device("gpu")
    else:
        print("ngpu should >= 0 !")

    with open(args.fastspeech2_config) as f:
        fastspeech2_config = CfgNode(yaml.safe_load(f))
    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(fastspeech2_config)

    evaluate(args, fastspeech2_config)


if __name__ == "__main__":
    main()
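gen_gta_mel.py and link_wav.py (both deleted by this revert) share an implicit naming contract: mels are saved as <utt_id>_feats.npy and matched against <utt_id>_wave.npy. A trivial check of that contract:

# Naming contract between gen_gta_mel.py and link_wav.py (illustrative only).
name = "003918_feats.npy"         # saved by gen_gta_mel.py as utt_id + "_feats.npy"
utt_id = name.split("_")[0]       # link_wav.py recovers the utterance id this way
wave_name = utt_id + "_wave.npy"  # and symlinks the matching wave dump
assert wave_name == "003918_wave.npy"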
paddlespeech/t2s/models/fastspeech2/fastspeech2.py (modified, +0 -125)

@@ -16,9 +16,7 @@
 from typing import Dict
 from typing import Sequence
 from typing import Tuple
-from typing import Union
 
-import numpy as np
 import paddle
 import paddle.nn.functional as F
 from paddle import nn

@@ -689,129 +687,6 @@ class FastSpeech2Inference(nn.Layer):
         return logmel
 
 
-class StyleFastSpeech2Inference(FastSpeech2Inference):
-    def __init__(self,
-                 normalizer,
-                 model,
-                 pitch_stats_path=None,
-                 energy_stats_path=None):
-        super().__init__(normalizer, model)
-        if pitch_stats_path:
-            pitch_mean, pitch_std = np.load(pitch_stats_path)
-            self.pitch_mean = paddle.to_tensor(pitch_mean)
-            self.pitch_std = paddle.to_tensor(pitch_std)
-        if energy_stats_path:
-            energy_mean, energy_std = np.load(energy_stats_path)
-            self.energy_mean = paddle.to_tensor(energy_mean)
-            self.energy_std = paddle.to_tensor(energy_std)
-
-    def denorm(self, data, mean, std):
-        return data * std + mean
-
-    def norm(self, data, mean, std):
-        return (data - mean) / std
-
-    def forward(self,
-                text: paddle.Tensor,
-                durations: Union[paddle.Tensor, np.ndarray]=None,
-                durations_scale: Union[int, float]=None,
-                durations_bias: Union[int, float]=None,
-                pitch: Union[paddle.Tensor, np.ndarray]=None,
-                pitch_scale: Union[int, float]=None,
-                pitch_bias: Union[int, float]=None,
-                energy: Union[paddle.Tensor, np.ndarray]=None,
-                energy_scale: Union[int, float]=None,
-                energy_bias: Union[int, float]=None,
-                robot: bool=False):
-        """
-        Parameters
-        ----------
-        text : Tensor(int64)
-            Input sequence of characters (T,).
-        durations : paddle.Tensor/np.ndarray, optional (int64)
-            Ground truth of duration (T,); overrides durations_scale and durations_bias.
-        durations_scale : int/float, optional
-        durations_bias : int/float, optional
-        pitch : paddle.Tensor/np.ndarray, optional
-            Ground truth of token-averaged pitch (T, 1); overrides pitch_scale and pitch_bias.
-        pitch_scale : int/float, optional
-            In denormed Hz domain.
-        pitch_bias : int/float, optional
-            In denormed Hz domain.
-        energy : paddle.Tensor/np.ndarray, optional
-            Ground truth of token-averaged energy (T, 1); overrides energy_scale and energy_bias.
-        energy_scale : int/float, optional
-            In denormed domain.
-        energy_bias : int/float, optional
-            In denormed domain.
-        robot : bool, optional
-            Whether to output robot style.
-        Returns
-        ----------
-        Tensor
-            Output sequence of features (L, odim).
-        """
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, durations=None, pitch=None, energy=None)
-        # priority: ground truth > scale/bias > previous output
-        # set durations
-        if isinstance(durations, np.ndarray):
-            durations = paddle.to_tensor(durations)
-        elif isinstance(durations, paddle.Tensor):
-            durations = durations
-        elif durations_scale or durations_bias:
-            durations_scale = durations_scale if durations_scale is not None else 1
-            durations_bias = durations_bias if durations_bias is not None else 0
-            durations = durations_scale * d_outs + durations_bias
-        else:
-            durations = d_outs
-        if robot:
-            # setting normed pitch to zeros has the same effect as setting denormed pitch to the mean
-            pitch = paddle.zeros(p_outs.shape)
-        # set pitch; this can overwrite the robot setting
-        if isinstance(pitch, np.ndarray):
-            pitch = paddle.to_tensor(pitch)
-        elif isinstance(pitch, paddle.Tensor):
-            pitch = pitch
-        elif pitch_scale or pitch_bias:
-            pitch_scale = pitch_scale if pitch_scale is not None else 1
-            pitch_bias = pitch_bias if pitch_bias is not None else 0
-            p_Hz = paddle.exp(
-                self.denorm(p_outs, self.pitch_mean, self.pitch_std))
-            p_Hz = pitch_scale * p_Hz + pitch_bias
-            pitch = self.norm(
-                paddle.log(p_Hz), self.pitch_mean, self.pitch_std)
-        else:
-            pitch = p_outs
-        # set energy
-        if isinstance(energy, np.ndarray):
-            energy = paddle.to_tensor(energy)
-        elif isinstance(energy, paddle.Tensor):
-            energy = energy
-        elif energy_scale or energy_bias:
-            energy_scale = energy_scale if energy_scale is not None else 1
-            energy_bias = energy_bias if energy_bias is not None else 0
-            e_dnorm = self.denorm(e_outs, self.energy_mean, self.energy_std)
-            e_dnorm = energy_scale * e_dnorm + energy_bias
-            energy = self.norm(e_dnorm, self.energy_mean, self.energy_std)
-        else:
-            energy = e_outs
-
-        normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text,
-            durations=durations,
-            pitch=pitch,
-            energy=energy,
-            use_teacher_forcing=True)
-        logmel = self.normalizer.inverse(normalized_mel)
-        return logmel
-
-
 class FastSpeech2Loss(nn.Layer):
     """Loss function module for FastSpeech2."""
 ...
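One step in the removed class (re-added verbatim in demos/style_fs2/style_syn.py above) deserves a note: pitch is stored as z-scored log-F0, so the robot branch's paddle.zeros is equivalent to pinning the denormalized log-F0 at its mean. A quick numpy check of that identity (statistics hypothetical):

import numpy as np

mean, std = 5.3, 0.25             # hypothetical log-F0 statistics
p_norm = np.zeros(4)              # what the robot branch sets
log_f0 = p_norm * std + mean      # denorm(), as defined in the class
assert np.allclose(log_f0, mean)  # pitch collapses to the mean: monotone voice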