From 680eac02b9629be6f3bcb6fefc3fdb991050809d Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Wed, 5 Jan 2022 17:17:40 +0800
Subject: [PATCH] [tts]Update mb melgan (#1272)

* update mb melgan

* update mb melgan, test=tts
---
 docs/source/released_model.md | 2 +-
 examples/csmsc/tts2/local/synthesize_e2e.sh | 6 +--
 examples/csmsc/tts3/local/synthesize_e2e.sh | 6 +--
 examples/csmsc/voc3/README.md | 8 ++--
 paddlespeech/cli/asr/infer.py | 1 -
 paddlespeech/cli/tts/infer.py | 8 ++--
 paddlespeech/t2s/datasets/am_batch_fn.py | 2 +
 .../t2s/exps/speedyspeech/preprocess.py | 4 +-
 paddlespeech/t2s/exps/speedyspeech/train.py | 11 +++--
 .../t2s/models/speedyspeech/speedyspeech.py | 41 +++++++++++--------
 .../speedyspeech/speedyspeech_updater.py | 6 +--
 11 files changed, 52 insertions(+), 43 deletions(-)

diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index f755c88e..6f8a6f9c 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -49,7 +49,7 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 8263bc23..0a4cf69b 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
- --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
- --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
- --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 6a7f093e..d4744486 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
- --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
- --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index f4f072e8..22104a8f 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -152,22 +152,22 @@ TODO:
The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set).
## Pretrained Models
-The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
+The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip).
The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) +The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: -default| 1(gpu) x 1000000| ——|—— |—— |—— | ——| +default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777| finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 | Multi Band MelGAN checkpoint contains files listed below. ```text -mb_melgan_baker_ckpt_0.5 +mb_melgan_csmsc_ckpt_0.1.1 ├── default.yaml # default config used to train multi band melgan ├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan └── snapshot_iter_1000000.pdz # generator parameters of multi band melgan diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 9097b680..db1e8341 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -211,7 +211,6 @@ class ASRExecutor(BaseExecutor): model_dict = paddle.load(self.ckpt_path) self.model.set_state_dict(model_dict) - def preprocess(self, model_type: str, input: Union[str, os.PathLike]): """ Input preprocess and return paddle.Tensor stored in self.input. 
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index c934d595..75470e89 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -168,13 +168,13 @@ pretrained_models = { # mb_melgan "mb_melgan_csmsc-zh": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', 'md5': - 'b69322ab4ea766d955bd3d9af7dc5f2d', + 'ee5f0604e20091f0d495b6ec4618b90d', 'config': - 'finetune.yaml', + 'default.yaml', 'ckpt': - 'snapshot_iter_2000000.pdz', + 'snapshot_iter_1000000.pdz', 'speech_stats': 'feats_stats.npy', }, diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 2d772bf3..526871a2 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -54,6 +54,7 @@ def speedyspeech_single_spk_batch_fn(examples): } return batch + def speedyspeech_multi_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] @@ -95,6 +96,7 @@ def speedyspeech_multi_spk_batch_fn(examples): batch["spk_id"] = spk_id return batch + def fastspeech2_single_spk_batch_fn(examples): # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] text = [np.array(item["text"], dtype=np.int64) for item in examples] diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index 6003d140..9ff77144 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -13,7 +13,6 @@ # limitations under the License. 
import argparse import re -import os from concurrent.futures import ThreadPoolExecutor from operator import itemgetter from pathlib import Path @@ -32,8 +31,9 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones -from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence + def process_sentence(config: Dict[str, Any], fp: Path, diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index cf3741a0..448cd7bb 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -27,8 +27,8 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator @@ -58,7 +58,9 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) - fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + fields = [ + "phones", "tones", "num_phones", "num_frames", "feats", "durations" + ] spk_num = None if args.speaker_dict is not None: @@ -137,7 +139,10 @@ def train_sp(args, config): print("tone_size:", tone_size) model = SpeedySpeech( - vocab_size=vocab_size, tone_size=tone_size, spk_num=spk_num, **config["model"]) + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ed085dfd..107c5f1c 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -14,7 +14,7 @@ import numpy as np import paddle from paddle import nn -import paddle.nn.functional as F + from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding @@ -95,8 +95,13 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): - def __init__(self, vocab_size, tone_size, hidden_size, kernel_size, - dilations, spk_num=None): + def __init__(self, + vocab_size, + tone_size, + hidden_size, + kernel_size, + dilations, + spk_num=None): super().__init__() self.embedding = TextEmbedding( vocab_size, @@ -104,7 +109,7 @@ class SpeedySpeechEncoder(nn.Layer): tone_size, padding_idx=0, tone_padding_idx=0) - + if spk_num: self.spk_emb = nn.Embedding( num_embeddings=spk_num, @@ -112,7 +117,7 @@ class SpeedySpeechEncoder(nn.Layer): padding_idx=0) else: self.spk_emb = None - + self.prenet = nn.Sequential( nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) @@ -171,19 +176,18 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__( - self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - 
encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, - spk_num=None): + def __init__(self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None): super().__init__() encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, @@ -255,6 +259,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded[0] + class SpeedySpeechInference(nn.Layer): def __init__(self, normalizer, speedyspeech_model): super().__init__() diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 6b94ff9b..ee45cdc8 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -57,8 +57,7 @@ class SpeedySpeechUpdater(StandardUpdater): text=batch["phones"], tones=batch["tones"], durations=batch["durations"], - spk_id=spk_id - ) + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( @@ -123,8 +122,7 @@ class SpeedySpeechEvaluator(StandardEvaluator): text=batch["phones"], tones=batch["tones"], durations=batch["durations"], - spk_id=spk_id - ) + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( -- GitLab
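
Usage note (a minimal sketch, not part of the patch): the updated `synthesize_e2e.sh` scripts above expect an unpacked `mb_melgan_csmsc_ckpt_0.1.1/` directory so the new `--voc_config`, `--voc_ckpt`, and `--voc_stat` paths resolve. The download URL and file names below are taken from this diff; running the commands from the example directory (e.g. `examples/csmsc/tts3`) is an assumption about the usual workflow.

```bash
# Sketch: fetch the renamed MB MelGAN checkpoint that the updated scripts point at.
# Assumes the current directory is the example root (e.g. examples/csmsc/tts3), so the
# relative --voc_config/--voc_ckpt/--voc_stat paths in local/synthesize_e2e.sh resolve.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip
unzip mb_melgan_csmsc_ckpt_0.1.1.zip
# Expected layout (per the voc3 README in this patch):
#   mb_melgan_csmsc_ckpt_0.1.1/default.yaml               -> --voc_config
#   mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz   -> --voc_ckpt
#   mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy             -> --voc_stat
ls mb_melgan_csmsc_ckpt_0.1.1
```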