diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index f755c88ece686a7bca7c7fc0d645b62e9b52cbc8..6f8a6f9c5563bffa4172912d8b06e05229012264 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -49,7 +49,7 @@ Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpe
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
-|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB|
+|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 8263bc23489e97e09c79e478d026c0779e9d08c7..0a4cf69bb4595ce57e25d4deef461c55d69eee16 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \
- --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
- --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
- --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+          --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 6a7f093e89b6ba7cbb56157b67e7a64e41ee6950..d4744486ca634bd85b0381f7e715147b78400d6f 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
- --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\
- --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+          --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
index f4f072e8469d82a89e71ebb3b6a5bc593a21fadc..22104a8f215f2c1eca29889778b98ac08575e193 100644
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -152,22 +152,22 @@ TODO:
The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set).
## Pretrained Models
-The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip).
+The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip).
The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
-The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip)
+The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip).
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------:
-default| 1(gpu) x 1000000| ——|—— |—— |—— | ——|
+default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777|
finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 |
Multi Band MelGAN checkpoint contains files listed below.
```text
-mb_melgan_baker_ckpt_0.5
+mb_melgan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train multi band melgan
├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan
└── snapshot_iter_1000000.pdz # generator parameters of multi band melgan
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 9097b6808ebc910f61d39073f13a3581e268c2b2..db1e8341f4f25cd871265b51b86cbbc7e68f112e 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -211,7 +211,6 @@ class ASRExecutor(BaseExecutor):
model_dict = paddle.load(self.ckpt_path)
self.model.set_state_dict(model_dict)
-
def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
"""
Input preprocess and return paddle.Tensor stored in self.input.
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index c934d5955cdbbfa474f181d285be9faf70159590..75470e89768f4168c37757c0f363f71fb3bf0d6a 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -168,13 +168,13 @@ pretrained_models = {
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip',
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
- 'b69322ab4ea766d955bd3d9af7dc5f2d',
+ 'ee5f0604e20091f0d495b6ec4618b90d',
'config':
- 'finetune.yaml',
+ 'default.yaml',
'ckpt':
- 'snapshot_iter_2000000.pdz',
+ 'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py
index 2d772bf3462edd6bf9d7ab85cecca2c49a999faf..526871a232d3241806377c16b459cfe42396b4df 100644
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
@@ -54,6 +54,7 @@ def speedyspeech_single_spk_batch_fn(examples):
}
return batch
+
def speedyspeech_multi_spk_batch_fn(examples):
# fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
@@ -95,6 +96,7 @@ def speedyspeech_multi_spk_batch_fn(examples):
batch["spk_id"] = spk_id
return batch
+
def fastspeech2_single_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
index 6003d14004917f7c15a03973f51483527046383b..9ff771442e4ef16cd5e9b87df664be7a5306329c 100644
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -13,7 +13,6 @@
# limitations under the License.
import argparse
import re
-import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
@@ -32,8 +31,9 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones
-from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+
def process_sentence(config: Dict[str, Any],
fp: Path,
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index cf3741a03718eaed2a9d10d1e2e9f238430377dd..448cd7bbf356f1e16dba5bc33464bc0910b2b65f 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -27,8 +27,8 @@ from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from yacs.config import CfgNode
-from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.speedyspeech import SpeedySpeech
from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator
@@ -58,7 +58,9 @@ def train_sp(args, config):
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
- fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"]
+ fields = [
+ "phones", "tones", "num_phones", "num_frames", "feats", "durations"
+ ]
spk_num = None
if args.speaker_dict is not None:
@@ -137,7 +139,10 @@ def train_sp(args, config):
print("tone_size:", tone_size)
model = SpeedySpeech(
- vocab_size=vocab_size, tone_size=tone_size, spk_num=spk_num, **config["model"])
+ vocab_size=vocab_size,
+ tone_size=tone_size,
+ spk_num=spk_num,
+ **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
index ed085dfd248cafd591f70ea325dee3d2c5e1f86c..107c5f1cc72a5272314e4a3af724a89f19a241df 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@@ -14,7 +14,7 @@
import numpy as np
import paddle
from paddle import nn
-import paddle.nn.functional as F
+
from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding
@@ -95,8 +95,13 @@ class TextEmbedding(nn.Layer):
class SpeedySpeechEncoder(nn.Layer):
- def __init__(self, vocab_size, tone_size, hidden_size, kernel_size,
- dilations, spk_num=None):
+ def __init__(self,
+ vocab_size,
+ tone_size,
+ hidden_size,
+ kernel_size,
+ dilations,
+ spk_num=None):
super().__init__()
self.embedding = TextEmbedding(
vocab_size,
@@ -104,7 +109,7 @@ class SpeedySpeechEncoder(nn.Layer):
tone_size,
padding_idx=0,
tone_padding_idx=0)
-
+
if spk_num:
self.spk_emb = nn.Embedding(
num_embeddings=spk_num,
@@ -112,7 +117,7 @@ class SpeedySpeechEncoder(nn.Layer):
padding_idx=0)
else:
self.spk_emb = None
-
+
self.prenet = nn.Sequential(
nn.Linear(hidden_size, hidden_size),
nn.ReLU(), )
@@ -171,19 +176,18 @@ class SpeedySpeechDecoder(nn.Layer):
class SpeedySpeech(nn.Layer):
- def __init__(
- self,
- vocab_size,
- encoder_hidden_size,
- encoder_kernel_size,
- encoder_dilations,
- duration_predictor_hidden_size,
- decoder_hidden_size,
- decoder_output_size,
- decoder_kernel_size,
- decoder_dilations,
- tone_size=None,
- spk_num=None):
+ def __init__(self,
+ vocab_size,
+ encoder_hidden_size,
+ encoder_kernel_size,
+ encoder_dilations,
+ duration_predictor_hidden_size,
+ decoder_hidden_size,
+ decoder_output_size,
+ decoder_kernel_size,
+ decoder_dilations,
+ tone_size=None,
+ spk_num=None):
super().__init__()
encoder = SpeedySpeechEncoder(vocab_size, tone_size,
encoder_hidden_size, encoder_kernel_size,
@@ -255,6 +259,7 @@ class SpeedySpeech(nn.Layer):
decoded = self.decoder(encodings)
return decoded[0]
+
class SpeedySpeechInference(nn.Layer):
def __init__(self, normalizer, speedyspeech_model):
super().__init__()
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
index 6b94ff9b9b5bf056f08f35b9ac07642fbc1c1a51..ee45cdc85dc6cd0c078f6699468aaa442c79a38d 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
@@ -57,8 +57,7 @@ class SpeedySpeechUpdater(StandardUpdater):
text=batch["phones"],
tones=batch["tones"],
durations=batch["durations"],
- spk_id=spk_id
- )
+ spk_id=spk_id)
target_mel = batch["feats"]
spec_mask = F.sequence_mask(
@@ -123,8 +122,7 @@ class SpeedySpeechEvaluator(StandardEvaluator):
text=batch["phones"],
tones=batch["tones"],
durations=batch["durations"],
- spk_id=spk_id
- )
+ spk_id=spk_id)
target_mel = batch["feats"]
spec_mask = F.sequence_mask(