diff --git a/docs/source/released_model.md b/docs/source/released_model.md index f755c88ece686a7bca7c7fc0d645b62e9b52cbc8..6f8a6f9c5563bffa4172912d8b06e05229012264 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -49,7 +49,7 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| +|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 8263bc23489e97e09c79e478d026c0779e9d08c7..0a4cf69bb4595ce57e25d4deef461c55d69eee16 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index 6a7f093e89b6ba7cbb56157b67e7a64e41ee6950..d4744486ca634bd85b0381f7e715147b78400d6f 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index f4f072e8469d82a89e71ebb3b6a5bc593a21fadc..22104a8f215f2c1eca29889778b98ac08575e193 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -152,22 +152,22 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). +The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip). The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) +The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: -default| 1(gpu) x 1000000| ——|—— |—— |—— | ——| +default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777| finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 | Multi Band MelGAN checkpoint contains files listed below. ```text -mb_melgan_baker_ckpt_0.5 +mb_melgan_csmsc_ckpt_0.1.1 ├── default.yaml # default config used to train multi band melgan ├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan └── snapshot_iter_1000000.pdz # generator parameters of multi band melgan diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 9097b6808ebc910f61d39073f13a3581e268c2b2..db1e8341f4f25cd871265b51b86cbbc7e68f112e 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -211,7 +211,6 @@ class ASRExecutor(BaseExecutor): model_dict = paddle.load(self.ckpt_path) self.model.set_state_dict(model_dict) - def preprocess(self, model_type: str, input: Union[str, os.PathLike]): """ Input preprocess and return paddle.Tensor stored in self.input. diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index c934d5955cdbbfa474f181d285be9faf70159590..75470e89768f4168c37757c0f363f71fb3bf0d6a 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -168,13 +168,13 @@ pretrained_models = { # mb_melgan "mb_melgan_csmsc-zh": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', 'md5': - 'b69322ab4ea766d955bd3d9af7dc5f2d', + 'ee5f0604e20091f0d495b6ec4618b90d', 'config': - 'finetune.yaml', + 'default.yaml', 'ckpt': - 'snapshot_iter_2000000.pdz', + 'snapshot_iter_1000000.pdz', 'speech_stats': 'feats_stats.npy', }, diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 2d772bf3462edd6bf9d7ab85cecca2c49a999faf..526871a232d3241806377c16b459cfe42396b4df 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -54,6 +54,7 @@ def speedyspeech_single_spk_batch_fn(examples): } return batch + def speedyspeech_multi_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] @@ -95,6 +96,7 @@ def speedyspeech_multi_spk_batch_fn(examples): batch["spk_id"] = spk_id return batch + def fastspeech2_single_spk_batch_fn(examples): # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] text = [np.array(item["text"], dtype=np.int64) for item in examples] diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py index 6003d14004917f7c15a03973f51483527046383b..9ff771442e4ef16cd5e9b87df664be7a5306329c 100644 --- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py +++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py @@ -13,7 +13,6 @@ # limitations under the License. import argparse import re -import os from concurrent.futures import ThreadPoolExecutor from operator import itemgetter from pathlib import Path @@ -32,8 +31,9 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones -from paddlespeech.t2s.datasets.preprocess_utils import merge_silence from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence + def process_sentence(config: Dict[str, Any], fp: Path, diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index cf3741a03718eaed2a9d10d1e2e9f238430377dd..448cd7bbf356f1e16dba5bc33464bc0910b2b65f 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -27,8 +27,8 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator @@ -58,7 +58,9 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) - fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + fields = [ + "phones", "tones", "num_phones", "num_frames", "feats", "durations" + ] spk_num = None if args.speaker_dict is not None: @@ -137,7 +139,10 @@ def train_sp(args, config): print("tone_size:", tone_size) model = SpeedySpeech( - vocab_size=vocab_size, tone_size=tone_size, spk_num=spk_num, **config["model"]) + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ed085dfd248cafd591f70ea325dee3d2c5e1f86c..107c5f1cc72a5272314e4a3af724a89f19a241df 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -14,7 +14,7 @@ import numpy as np import paddle from paddle import nn -import paddle.nn.functional as F + from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding @@ -95,8 +95,13 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): - def __init__(self, vocab_size, tone_size, hidden_size, kernel_size, - dilations, spk_num=None): + def __init__(self, + vocab_size, + tone_size, + hidden_size, + kernel_size, + dilations, + spk_num=None): super().__init__() self.embedding = TextEmbedding( vocab_size, @@ -104,7 +109,7 @@ class SpeedySpeechEncoder(nn.Layer): tone_size, padding_idx=0, tone_padding_idx=0) - + if spk_num: self.spk_emb = nn.Embedding( num_embeddings=spk_num, @@ -112,7 +117,7 @@ class SpeedySpeechEncoder(nn.Layer): padding_idx=0) else: self.spk_emb = None - + self.prenet = nn.Sequential( nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) @@ -171,19 +176,18 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__( - self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, - spk_num=None): + def __init__(self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None): super().__init__() encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, @@ -255,6 +259,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded[0] + class SpeedySpeechInference(nn.Layer): def __init__(self, normalizer, speedyspeech_model): super().__init__() diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 6b94ff9b9b5bf056f08f35b9ac07642fbc1c1a51..ee45cdc85dc6cd0c078f6699468aaa442c79a38d 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -57,8 +57,7 @@ class SpeedySpeechUpdater(StandardUpdater): text=batch["phones"], tones=batch["tones"], durations=batch["durations"], - spk_id=spk_id - ) + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( @@ -123,8 +122,7 @@ class SpeedySpeechEvaluator(StandardEvaluator): text=batch["phones"], tones=batch["tones"], durations=batch["durations"], - spk_id=spk_id - ) + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask(