diff --git a/deepspeech/__init__.py b/deepspeech/__init__.py index ed209f3deadef3f540fc2c0710ab538e199182cf..5762e6359404580ca229d4218afe6b08851371d5 100644 --- a/deepspeech/__init__.py +++ b/deepspeech/__init__.py @@ -355,7 +355,6 @@ if not hasattr(paddle.Tensor, 'tolist'): "register user tolist to paddle.Tensor, remove this when fixed!") setattr(paddle.Tensor, 'tolist', tolist) - ########### hack paddle.nn ############# from paddle.nn import Layer from typing import Optional @@ -506,5 +505,3 @@ if not hasattr(paddle.nn, 'LayerDict'): logger.debug( "register user LayerDict to paddle.nn, remove this when fixed!") setattr(paddle.nn, 'LayerDict', LayerDict) - - diff --git a/deepspeech/decoders/recog.py b/deepspeech/decoders/recog.py index c8df65d6838c097ee78fef46622c1a9f14aa09c3..6dea6b701ac5832a70f817a5906e2f799f51ebd6 100644 --- a/deepspeech/decoders/recog.py +++ b/deepspeech/decoders/recog.py @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`.""" -import json -from pathlib import Path - import jsonlines import paddle -import yaml from yacs.config import CfgNode from .beam_search import BatchBeamSearch @@ -79,8 +75,7 @@ def recog_v2(args): sort_in_input_length=False, preprocess_conf=confs.collator.augmentation_config if args.preprocess_conf is None else args.preprocess_conf, - preprocess_args={"train": False}, - ) + preprocess_args={"train": False}, ) if args.rnnlm: lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) @@ -113,8 +108,7 @@ def recog_v2(args): ctc=args.ctc_weight, lm=args.lm_weight, ngram=args.ngram_weight, - length_bonus=args.penalty, - ) + length_bonus=args.penalty, ) beam_search = BeamSearch( beam_size=args.beam_size, vocab_size=len(char_list), @@ -123,8 +117,7 @@ def recog_v2(args): sos=model.sos, eos=model.eos, token_list=char_list, - pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", - ) + pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if args.batchsize == 1: @@ -171,9 +164,10 @@ def recog_v2(args): logger.info(f'feat: {feat.shape}') enc = model.encode(paddle.to_tensor(feat).to(dtype)) logger.info(f'eout: {enc.shape}') - nbest_hyps = beam_search(x=enc, - maxlenratio=args.maxlenratio, - minlenratio=args.minlenratio) + nbest_hyps = beam_search( + x=enc, + maxlenratio=args.maxlenratio, + minlenratio=args.minlenratio) nbest_hyps = [ h.asdict() for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)] @@ -183,9 +177,8 @@ def recog_v2(args): item = new_js[name]['output'][0] # 1-best ref = item['text'] - rec_text = item['rec_text'].replace('▁', - ' ').replace('', - '').strip() + rec_text = item['rec_text'].replace('▁', ' ').replace( + '', '').strip() rec_tokenid = list(map(int, item['rec_tokenid'].split())) f.write({ "utt": name, diff --git a/deepspeech/decoders/recog_bin.py b/deepspeech/decoders/recog_bin.py index 567dfecde9cb470b65e1bdbb4d9c0dbad87b83de..fbf582f7fdf8943e3c317fc987a123e3ac8bb3b1 100644 --- a/deepspeech/decoders/recog_bin.py +++ b/deepspeech/decoders/recog_bin.py @@ -21,7 +21,7 @@ from distutils.util import strtobool import configargparse import numpy as np -from .recog import recog_v2 +from deepspeech.decoders.recog import recog_v2 def get_parser(): @@ -359,7 +359,7 @@ def main(args): if args.num_encs == 1: # Experimental API that supports custom LMs if args.api == "v2": - from deepspeech.decoders.recog import recog_v2 + recog_v2(args) 
         else:
             raise ValueError("Only support --api v2")
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
index e0c8006d17272dde54ac948b9f112fa1d67ba92a..df6848db0b91b7dce1a140c8f9c6a57e2810ef2b 100644
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Union
+
 import paddle
 from paddle import nn
-from typing import Union
 from paddle.nn import functional as F
 from typeguard import check_argument_types
diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md
index 2111901df5ddfee4377c2e1fbe21cece562bc708..b52950c47cff51d956c981b01af808a59734fb96 100644
--- a/examples/aishell3/README.md
+++ b/examples/aishell3/README.md
@@ -1,4 +1,11 @@
 # Aishell3
-* tts0 - fastspeech2
-* vc0 - tactron2 voice clone
+* tts0 - Tacotron2
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
+* vc0 - Tacotron2 Voice Clone with GE2E
diff --git a/examples/aishell3/tts0/run.sh b/examples/aishell3/tts0/run.sh
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/examples/vctk/fastspeech2/aishell3/README.md b/examples/aishell3/tts3/README.md
similarity index 84%
rename from examples/vctk/fastspeech2/aishell3/README.md
rename to examples/aishell3/tts3/README.md
index c562428567587dc2654208f57a1a1fa2e92faa7f..130c52e13a4a80d28478188ca8bdee754c110638 100644
--- a/examples/vctk/fastspeech2/aishell3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -18,12 +18,23 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ### Get MFA result of AISHELL-3 and Extract it
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. You can download [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (which uses MFA1.x for now) of our repo.
-### Preprocess the dataset
+
+## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
 Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**,
+2. preprocess the dataset,
+3. train the model,
+4. synthesize wavs:
+    - synthesize waveform from `metadata.jsonl`,
+    - synthesize waveform from a text file.
 ```bash
-./preprocess.sh
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
 ```text
@@ -47,10 +58,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker, and id of each utterance.
-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. ```text @@ -85,20 +96,8 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. 7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2. -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) -FastSpeech2 checkpoint contains files listed below. - -```text -fastspeech2_nosil_aishell3_ckpt_0.4 -├── default.yaml # default config used to train fastspeech2 -├── phone_id_map.txt # phone vocabulary file when training fastspeech2 -├── snapshot_iter_96400.pdz # model parameters and optimizer states -├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 -└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 -``` -## Synthesize +### Synthesize We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it. ```bash @@ -111,9 +110,9 @@ pwg_baker_ckpt_0.4 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -153,22 +152,22 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` - -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file. - +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] [--text TEXT] - [--output-dir OUTPUT_DIR] [--device DEVICE] - [--verbose VERBOSE] +usage: multi_spk_synthesize_e2e.py [-h] + [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] + [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--text TEXT] + [--output-dir OUTPUT_DIR] [--device DEVICE] + [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -204,24 +203,38 @@ optional arguments: 5. 
`--output-dir` is the directory to save synthesized audio files. 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. +## Pretrained Model +Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip) + +FastSpeech2 checkpoint contains files listed below. + +```text +fastspeech2_nosil_aishell3_ckpt_0.4 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_96400.pdz # model parameters and optimizer states +├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models. ```bash +source path.sh + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ --fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \ --fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt ``` - ## Future work A multi-speaker vocoder is needed. diff --git a/examples/vctk/fastspeech2/aishell3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/aishell3/conf/default.yaml rename to examples/aishell3/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/aishell3/preprocess.sh b/examples/aishell3/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/aishell3/preprocess.sh rename to examples/aishell3/tts3/local/preprocess.sh index 281abee0c6311fe273dbaab3257ac4475f8735da..a40ee96d7d2bdadd81ca7d00d9eca2f88173c8d6 100755 --- a/examples/vctk/fastspeech2/aishell3/preprocess.sh +++ b/examples/aishell3/tts3/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./aishell3_alignment_tone \ --output durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." 
- python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=aishell3 \ --rootdir=~/datasets/data_aishell3/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/vctk/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh similarity index 65% rename from examples/vctk/fastspeech2/vctk/synthesize.sh rename to examples/aishell3/tts3/local/synthesize.sh index 329fc9e1e38a21c2a2501f27f0ebece986d91edf..64361983d335197968dd048a9615dd7ad03aebd7 100755 --- a/examples/vctk/fastspeech2/vctk/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -1,15 +1,20 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh similarity index 58% rename from examples/vctk/fastspeech2/vctk/synthesize_e2e.sh rename to examples/aishell3/tts3/local/synthesize_e2e.sh index 446e3363c6e0caa9b4fdbb529f1aadc2a487948e..8a979844b0567b7326990680c2d087642ff5c05c 100755 --- a/examples/vctk/fastspeech2/vctk/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -1,15 +1,20 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ 
   --fastspeech2-stat=dump/train/speech_stats.npy \
   --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
   --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
   --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
-  --text=../sentences_en.txt \
-  --output-dir=exp/default/test_e2e \
+  --text=${BIN_DIR}/../sentences.txt \
+  --output-dir=${train_output_path}/test_e2e \
   --device="gpu" \
   --phones-dict=dump/phone_id_map.txt \
   --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/vctk/fastspeech2/aishell3/run.sh b/examples/aishell3/tts3/local/train.sh
similarity index 61%
rename from examples/vctk/fastspeech2/aishell3/run.sh
rename to examples/aishell3/tts3/local/train.sh
index d4f06da9180fe5a4ba5500040735910d35b8ca77..be6051c979d0ff319be06ca0899385b89e286fcf 100755
--- a/examples/vctk/fastspeech2/aishell3/run.sh
+++ b/examples/aishell3/tts3/local/train.sh
@@ -1,10 +1,13 @@
 #!/bin/bash
-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
   --train-metadata=dump/train/norm/metadata.jsonl \
   --dev-metadata=dump/dev/norm/metadata.jsonl \
-  --config=conf/default.yaml \
-  --output-dir=exp/default \
+  --config=${config_path} \
+  --output-dir=${train_output_path} \
   --nprocs=2 \
   --phones-dict=dump/phone_id_map.txt \
   --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..561d01632ba4b63b582ef07e460d31842823f07c
--- /dev/null
+++ b/examples/aishell3/tts3/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..656710763aa578716d16f77c69c84ea402b7b9e9
--- /dev/null
+++ b/examples/aishell3/tts3/run.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_482.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` saved under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5803f644ee34538623efe158a5f00a60f417fbb
--- /dev/null
+++ b/examples/aishell3/vc0/README.md
@@ -0,0 +1,89 @@
+# Tacotron2 + AISHELL-3 Voice Cloning
+This example contains code used to train a [Tacotron2](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used for voice cloning; we follow the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf). The general steps are as follows:
+1. Speaker Encoder: We train a speaker encoder on a speaker verification task. The datasets used for this task are different from those used to train Tacotron2; because transcriptions are not needed, we can use more data, see [ge2e](../../other/ge2e).
+2. Synthesizer: We then use the trained speaker encoder to generate an utterance embedding for each sentence in AISHELL-3. This embedding is an extra input of Tacotron2 and is concatenated with the encoder outputs.
+3. Vocoder: We use WaveFlow as the neural vocoder, see [waveflow](../../ljspeech/voc0).
+
+## Get Started
+Assume the path to the dataset is `~/datasets/data_aishell3`.
+Assume the path to the MFA result of AISHELL-3 is `./alignment`.
+Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`.
+Run the command below to
+1. **source path**,
+2. preprocess the dataset,
+3. train the model,
+4. start a voice cloning inference.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
+```
+#### generate utterance embedding
+Use the pretrained GE2E model (the speaker encoder) to generate an utterance embedding for each sentence in AISHELL-3. The embeddings have the same file structure as the wav files, and the format is `.npy`.
+
+```bash
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../../ge2e/inference.py \
+        --input=${input} \
+        --output=${preprocess_path}/embed \
+        --device="gpu" \
+        --checkpoint_path=${ge2e_ckpt_path}
+fi
+```
+
+Computing the utterance embeddings can take x hours.
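+
+Each embedding is a single fixed-length vector; for the pretrained GE2E model used here its shape is `(256,)`. Below is a minimal sketch of loading one of the generated `.npy` files to inspect it. The exact file path under `${preprocess_path}/embed` depends on your wav layout; the one shown is hypothetical.
+
+```python
+import numpy as np
+
+# The embed directory mirrors the wav directory structure (hypothetical path).
+embed = np.load("dump/embed/SSB0005/SSB00050001.npy")
+print(embed.shape)  # (256,) for the pretrained GE2E model
+```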
+#### process wav
+There is silence at the edges of AISHELL-3's wavs, and the audio amplitude is very small, so we need to remove the silence and normalize the audio. You could use a silence-removal method based on volume or energy, but the effect is not very good. Instead, we use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment between text and speech, then use the alignment results to remove the silence.
+
+We use Montreal Forced Aligner 1.0. The labels in AISHELL-3 include pinyin, so the lexicon we provide to MFA is pinyin rather than Chinese characters, and the prosody marks (`$` and `%`) need to be removed. You should preprocess the dataset into the format MFA needs: the text files have the same names as the wavs, with the suffix `.lab`.
+
+We use [lexicon.txt](./lexicon.txt) as the lexicon.
+
+You can download the alignment results from [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (which uses MFA1.x for now) of our repo.
+
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "Process wav ..."
+    python3 ${BIN_DIR}/process_wav.py \
+        --input=${input}/wav \
+        --output=${preprocess_path}/normalized_wav \
+        --alignment=${alignment}
+fi
+```
+
+#### preprocess transcription
+We convert the transcriptions into `phones` and `tones`. Note that our processing here is different from that used for MFA: we separate out the tones. This is just one possible scheme; you could, of course, segment only initials and finals.
+
+```bash
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/preprocess_transcription.py \
+        --input=${input} \
+        --output=${preprocess_path}
+fi
+```
+The default input is `~/datasets/data_aishell3/train`, which contains `label_train-set.txt`. The processed results are `metadata.yaml` and `metadata.pickle`; the former is a text format for easy viewing, the latter a binary format for direct reading.
+#### extract mel
+```bash
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    python3 ${BIN_DIR}/extract_mel.py \
+        --input=${preprocess_path}/normalized_wav \
+        --output=${preprocess_path}/mel
+fi
+```
+
+### Train the model
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+
+Our model removes the stop token prediction in Tacotron2, because the positive and negative samples of stop token prediction are extremely unbalanced, and the prediction is very sensitive to how the audio silence was clipped. Instead, we stop decoding once the highest point of the attention reaches the last symbol on the encoder side.
+
+In addition, to accelerate the convergence of the model, we add a `guided attention loss` to push the alignment between encoder and decoder toward a diagonal more quickly.
+### Inference
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
+```
+## Pretrained Model
+[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
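+
+For reference, the end-to-end cloning flow behind `./local/voice_cloning.sh` follows the `voice_cloning.ipynb` notebook that this change removes: embed a reference wav with GE2E, condition Tacotron2 on that embedding, then vocode the predicted mel with WaveFlow. Below is a condensed sketch of that notebook. The checkpoint paths are placeholders, the import paths follow the old notebook layout and may differ here, and `convert_sentence`, `voc_phones`, and `voc_tones` come from this example's transcription preprocessing.
+
+```python
+import numpy as np
+import paddle
+from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
+from parakeet.models.tacotron2 import Tacotron2
+# Import path follows the removed notebook and may differ in this layout.
+from examples.ge2e.audio_processor import SpeakerVerificationPreprocessor
+
+# 1. Speaker encoder: reference wav -> 256-dim utterance embedding.
+p = SpeakerVerificationPreprocessor(
+    sampling_rate=16000, audio_norm_target_dBFS=-30, vad_window_length=30,
+    vad_moving_average_width=8, vad_max_silence_length=6,
+    mel_window_length=25, mel_window_step=10, n_mels=40,
+    partial_n_frames=160, min_pad_coverage=0.75, partial_overlap_ratio=0.5)
+speaker_encoder = LSTMSpeakerEncoder(
+    n_mels=40, num_layers=3, hidden_size=256, output_size=256)
+speaker_encoder.set_state_dict(
+    paddle.load("ge2e_ckpt_0.3/step-3000000.pdparams"))  # placeholder path
+speaker_encoder.eval()
+mel_sequences = p.extract_mel_partials(p.preprocess_wav("ref_audio/ref.wav"))
+with paddle.no_grad():
+    embed = speaker_encoder.embed_utterance(paddle.to_tensor(mel_sequences))
+
+# 2. Synthesizer: phones/tones + embedding -> mel spectrogram. The sizes
+#    below match the tacotron2_aishell3 pretrained checkpoint.
+synthesizer = Tacotron2(
+    vocab_size=68, n_tones=10, d_mels=80, d_encoder=512,
+    encoder_conv_layers=3, encoder_kernel_size=5, d_prenet=256,
+    d_attention_rnn=1024, d_decoder_rnn=1024, attention_filters=32,
+    attention_kernel_size=31, d_attention=128, d_postnet=512,
+    postnet_kernel_size=5, postnet_conv_layers=5, reduction_factor=1,
+    p_encoder_dropout=0.5, p_prenet_dropout=0.5, p_attention_dropout=0.1,
+    p_decoder_dropout=0.1, p_postnet_dropout=0.5,
+    d_global_condition=256, use_stop_token=False)
+synthesizer.set_state_dict(
+    paddle.load("tacotron2_aishell3_ckpt_0.3/step-450000.pdparams"))
+synthesizer.eval()
+
+# Sentences may only contain Chinese characters plus the prosody marks
+# `%` (prosodic word boundary) and `$` (prosodic phrase boundary).
+phones, tones = convert_sentence("每当你觉得%想要批评什么人的时候$")
+phones = paddle.to_tensor(
+    np.array([voc_phones.lookup(it) for it in phones], dtype=np.int64)).unsqueeze(0)
+tones = paddle.to_tensor(
+    np.array([voc_tones.lookup(it) for it in tones], dtype=np.int64)).unsqueeze(0)
+outputs = synthesizer.infer(
+    phones, tones=tones, global_condition=paddle.unsqueeze(embed, 0))
+mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
+
+# 3. Vocoder: mel -> waveform with WaveFlow; see the waveflow example for
+#    loading ConditionalWaveFlow and running synthesis on `mel_input`.
+```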
diff --git a/examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png b/examples/aishell3/vc0/images/alignment-step2000.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/alignment-step2000.png rename to examples/aishell3/vc0/images/alignment-step2000.png diff --git a/examples/aishell3/vc0/local/tacotron2/images/train.png b/examples/aishell3/vc0/images/train.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/train.png rename to examples/aishell3/vc0/images/train.png diff --git a/examples/aishell3/vc0/local/tacotron2/images/valid.png b/examples/aishell3/vc0/images/valid.png similarity index 100% rename from examples/aishell3/vc0/local/tacotron2/images/valid.png rename to examples/aishell3/vc0/images/valid.png diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh new file mode 100755 index 0000000000000000000000000000000000000000..87cfab32a1d90f798b0df155872f8bbac598bf72 --- /dev/null +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +input=$1 +preprocess_path=$2 +alignment=$3 +ge2e_ckpt_path=$4 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../../ge2e/inference.py \ + --input=${input} \ + --output=${preprocess_path}/embed \ + --device="gpu" \ + --checkpoint_path=${ge2e_ckpt_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "Process wav ..." + python3 ${BIN_DIR}/process_wav.py \ + --input=${input}/wav \ + --output=${preprocess_path}/normalized_wav \ + --alignment=${alignment} +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python3 ${BIN_DIR}/preprocess_transcription.py \ + --input=${input} \ + --output=${preprocess_path} +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + python3 ${BIN_DIR}/extract_mel.py \ + --input=${preprocess_path}/normalized_wav \ + --output=${preprocess_path}/mel +fi diff --git a/examples/aishell3/vc0/local/tacotron2/README_cn.md b/examples/aishell3/vc0/local/tacotron2/README_cn.md deleted file mode 100644 index a364994a696daa14f637c79fd15c49c1c0b31ddb..0000000000000000000000000000000000000000 --- a/examples/aishell3/vc0/local/tacotron2/README_cn.md +++ /dev/null @@ -1,112 +0,0 @@ -## Tacotron2 + AISHELL-3 数据集训练语音克隆模型 - -本实验的内容是利用 AISHELL-3 数据集和 Tacotron 2 模型进行语音克隆任务,使用的模型大体结构和论文 [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) 相同。大致步骤如下: - -1. Speaker Encoder: 我们使用了一个 Speaker Verification 任务训练一个 speaker encoder。这部分任务所用的数据集和训练 Tacotron 2 的数据集不同,因为不需要 transcription 的缘故,我们使用了较多的训练数据,可以参考实现 [ge2e](../ge2e)。 -2. Synthesizer: 然后使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 这个 Embedding 作为 Tacotron 模型中的一个额外输入和 encoder outputs 拼接在一起。 -3. Vocoder: 我们使用的声码器是 WaveFlow,参考实验 [waveflow](../waveflow). - -## 数据处理 - -### utterance embedding 的生成 - -使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 以和音频文件夹同构的方式存储。存储格式是 `.npy` 文件。 - -首先 cd 到 [ge2e](../ge2e) 文件夹。下载训练好的 [模型](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip),然后运行脚本生成每个句子的 utterance embedding. - -```bash -python inference.py --input= --output= --device="gpu" --checkpoint_path= -``` - -其中 input 是只包含音频文件夹的文件。这里可以用 `~/datasets/aishell3/train/wav`,然后 output 是用于存储 utterance embed 的文件夹,这里可以用 `~/datasets/aishell3/train/embed`。Utterance embedding 会以和音频文件夹相同的文件结构存储,格式为 `.npy`. 
- -utterance embedding 的计算可能会用几个小时的时间,请耐心等待。 - -### 音频处理 - -因为 AISHELL-3 数据集前后有一些空白,静音片段,而且语音幅值很小,所以我们需要进行空白移除和音量规范化。空白移除可以简单的使用基于音量或者能量的方法,但是效果不是很好,对于不同的句子很难取到一个一致的阈值。我们使用的是先利用 Force Aligner 进行文本和语音的对齐。然后根据对齐结果截除空白。 - -我们使用的工具是 Montreal Force Aligner 1.0. 因为 aishell 的标注包含拼音标注,所以我们提供给 Montreal Force Aligner 的是拼音 transcription 而不是汉字 transcription. 而且需要把其中的韵律标记(`$` 和 `%`)去除,并且处理成 Montreal Force Alinger 所需要的文件形式。和音频同名的文本文件,扩展名为 `.lab`. - -此外还需要准备词典文件。其中包含把拼音序列转换为 phone 序列的映射关系。在这里我们只做声母和韵母的切分,而声调则归为韵母的一部分。我们使用的[词典文件](./lexicon.txt)可以下载。 - -准备好之后运行训练和对齐。首先下载 [Montreal Force Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1).下载之后解压即可运行。cd 到其中的 bin 文件夹运行命令,即可进行训练和对齐。前三个命令行参数分别是音频文件夹的路径,词典路径和对齐文件输出路径。可以通过`-o` 传入训练得到的模型保存路径。 - -```bash -./mfa_train_and_align \ - ~/datasets/aishell3/train/wav \ - lexicon.txt \ - ~/datasets/aishell3/train/alignment \ - -o aishell3_model \ - -v -``` - -因为训练和对齐的时间比较长。我们提供了对齐后的 [alignment 文件](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz),其中每个句子对应的文件为 `.TextGrid` 格式的文本。 - -得到了对齐文件之后,可以运行 `process_wav.py` 脚本来处理音频。 - -```bash -python process_wav.py --input= --output= --alignment= -``` - -默认 input, output, alignment 分别是 `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, `~/datasets/aishell3/train/alignment`. - -处理结束后,会将处理好的音频保存在 `` 文件夹中。 - -### 转录文本处理 - -把文本转换成为 phone 和 tone 的形式,并存储起来。值得注意的是,这里我们的处理和用于 montreal force aligner 的不一样。我们把声调分了出来。这是一个处理方式,当然也可以只做声母和韵母的切分。 - -运行脚本处理转录文本。 - -```bash -python preprocess_transcription.py --input= --output= -``` - -默认的 input 是 `~/datasets/aishell3/train`,其中会包含 `label_train-set.txt` 文件,处理后的结果会 `metadata.yaml` 和 `metadata.pickle`. 前者是文本格式,方便查看,后者是二进制格式,方便直接读取。 - -### mel 频谱提取 - -对处理后的音频进行 mel 频谱的提取,并且以和音频文件夹同构的方式存储,存储格式是 `.npy` 文件。 - -```python -python extract_mel.py --input= --output= -``` - -input 是处理后的音频所在的文件夹,output 是输出频谱的文件夹。 - -## 训练 - -运行脚本训练。 - -```python -python train.py --data= --output= --device="gpu" -``` - -我们的模型去掉了 tacotron2 模型中的 stop token prediction。因为实践中由于 stop token prediction 是一个正负样例比例极不平衡的问题,每个句子可能有几百帧对应负样例,只有一帧正样例,而且这个 stop token prediction 对音频静音的裁切十分敏感。我们转用 attention 的最高点到达 encoder 侧的最后一个符号为终止条件。 - -另外,为了加速模型的收敛,我们加上了 guided attention loss, 诱导 encoder-decoder 之间的 alignment 更快地呈现对角线。 - -可以使用 visualdl 查看训练过程的 log。 - -```bash -visualdl --logdir= --host=$HOSTNAME -``` - -示例 training loss / validation loss 曲线如下。 - -![train](./images/train.png) - -![valid](./images/valid.png) - -alignment-step2000 - -大约从训练 2000 步左右就从 validation 过程中产出的 alignement 中可以观察到模糊的对角线。随着训练步数增加,对角线会更加清晰。但因为 validation 也是以 teacher forcing 的方式进行的,所以要在真正的 auto regressive 合成中产出的 alignment 中观察到对角线,需要更长的时间。 - -## 预训练模型 - -预训练模型下载链接。[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip). 
- -## 使用 - -本实验包含了一个简单的使用示例,用户可以替换作为参考的声音以及文本,用训练好的模型来合成语音。使用方式参考 [notebook](./voice_cloning.ipynb) 上的使用说明。 diff --git a/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb b/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb deleted file mode 100644 index fc4705fc6219b17374cea478ab76a94dd2f8d03f..0000000000000000000000000000000000000000 --- a/examples/aishell3/vc0/local/tacotron2/voice_cloning.ipynb +++ /dev/null @@ -1,383 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import paddle\n", - "from matplotlib import pyplot as plt\n", - "from IPython import display as ipd\n", - "import soundfile as sf\n", - "import librosa.display\n", - "from parakeet.utils import display\n", - "paddle.set_device(\"gpu:0\")\n", - "import sys\n", - "sys.path.append(\"../../\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 加载模型" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "vocab_phones:\n", - " Vocab(size: 68,\n", - "stoi:\n", - "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('$', 4), ('%', 5), ('&r', 6), ('a', 7), ('ai', 8), ('an', 9), ('ang', 10), ('ao', 11), ('b', 12), ('c', 13), ('ch', 14), ('d', 15), ('e', 16), ('ea', 17), ('ei', 18), ('en', 19), ('eng', 20), ('er', 21), ('f', 22), ('g', 23), ('h', 24), ('i', 25), ('ia', 26), ('iai', 27), ('ian', 28), ('iang', 29), ('iao', 30), ('ie', 31), ('ien', 32), ('ieng', 33), ('ii', 34), ('iii', 35), ('io', 36), ('iou', 37), ('j', 38), ('k', 39), ('l', 40), ('m', 41), ('n', 42), ('o', 43), ('ou', 44), ('p', 45), ('q', 46), ('r', 47), ('s', 48), ('sh', 49), ('t', 50), ('u', 51), ('ua', 52), ('uai', 53), ('uan', 54), ('uang', 55), ('uei', 56), ('uen', 57), ('ueng', 58), ('uo', 59), ('v', 60), ('van', 61), ('ve', 62), ('ven', 63), ('veng', 64), ('x', 65), ('z', 66), ('zh', 67)]))\n", - "vocab_tones:\n", - " Vocab(size: 10,\n", - "stoi:\n", - "OrderedDict([('', 0), ('', 1), ('', 2), ('', 3), ('0', 4), ('1', 5), ('2', 6), ('3', 7), ('4', 8), ('5', 9)]))\n" - ] - } - ], - "source": [ - "from examples.ge2e.audio_processor import SpeakerVerificationPreprocessor\n", - "from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder\n", - "\n", - "# speaker encoder\n", - "p = SpeakerVerificationPreprocessor(\n", - " sampling_rate=16000, \n", - " audio_norm_target_dBFS=-30, \n", - " vad_window_length=30, \n", - " vad_moving_average_width=8, \n", - " vad_max_silence_length=6, \n", - " mel_window_length=25, \n", - " mel_window_step=10, \n", - " n_mels=40, \n", - " partial_n_frames=160, \n", - " min_pad_coverage=0.75, \n", - " partial_overlap_ratio=0.5)\n", - "speaker_encoder = LSTMSpeakerEncoder(n_mels=40, num_layers=3, hidden_size=256, output_size=256)\n", - "speaker_encoder_params_path = \"../../pretrained/ge2e/ge2e_ckpt_0.3/step-3000000.pdparams\"\n", - "speaker_encoder.set_state_dict(paddle.load(speaker_encoder_params_path))\n", - "speaker_encoder.eval()\n", - "\n", - "# synthesizer\n", - "from parakeet.models.tacotron2 import Tacotron2\n", - "from examples.tacotron2_aishell3.chinese_g2p import convert_sentence\n", - "from examples.tacotron2_aishell3.aishell3 import voc_phones, voc_tones\n", - "\n", - "synthesizer = Tacotron2(\n", - " vocab_size=68,\n", - " n_tones=10,\n", - " 
d_mels= 80,\n", - " d_encoder= 512,\n", - " encoder_conv_layers = 3,\n", - " encoder_kernel_size= 5,\n", - " d_prenet= 256,\n", - " d_attention_rnn= 1024,\n", - " d_decoder_rnn = 1024,\n", - " attention_filters = 32,\n", - " attention_kernel_size = 31,\n", - " d_attention= 128,\n", - " d_postnet = 512,\n", - " postnet_kernel_size = 5,\n", - " postnet_conv_layers = 5,\n", - " reduction_factor = 1,\n", - " p_encoder_dropout = 0.5,\n", - " p_prenet_dropout= 0.5,\n", - " p_attention_dropout= 0.1,\n", - " p_decoder_dropout= 0.1,\n", - " p_postnet_dropout= 0.5,\n", - " d_global_condition=256,\n", - " use_stop_token=False,\n", - ")\n", - "params_path = \"../../pretrained/tacotron2_aishell3/tacotron2_aishell3_ckpt_0.3/step-450000.pdparams\"\n", - "synthesizer.set_state_dict(paddle.load(params_path))\n", - "synthesizer.eval()\n", - "\n", - "# vocoder\n", - "from parakeet.models import ConditionalWaveFlow\n", - "vocoder = ConditionalWaveFlow(upsample_factors=[16, 16], n_flows=8, n_layers=8, n_group=16, channels=128, n_mels=80, kernel_size=[3, 3])\n", - "params_path = \"../../pretrained/waveflow/waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams\"\n", - "vocoder.set_state_dict(paddle.load(params_path))\n", - "vocoder.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 生成 speaker encoding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "首先在当前文件夹下新建文件夹 `ref_audio`,把要作为参考的音频存在在这个文件夹中。格式要求是 wav 格式,采样率会被重采样至 16kHz." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ref_name = \"女声2.wav\"\n", - "ref_audio_path = f\"./ref_audio/{ref_name}\"\n", - "ipd.Audio(ref_audio_path, normalize=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mel_sequences: (2, 160, 40)\n", - "embed shape: [256]\n" - ] - } - ], - "source": [ - "mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))\n", - "print(\"mel_sequences: \", mel_sequences.shape)\n", - "with paddle.no_grad():\n", - " embed = speaker_encoder.embed_utterance(paddle.to_tensor(mel_sequences))\n", - "print(\"embed shape: \", embed.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 合成频谱" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "因为 AISHELL-3 数据集中使用 `%` 和 `$` 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 `%` 和 `$` 来调节韵律。\n", - "\n", - "值得的注意的是,句子的有效字符集仅包含汉字和 `%`, `$`, 因此输入的句子只能包含这些字符。" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['m', 'ei', 'd', 'ang', 'n', 'i', 'j', 've', 'd', 'e', '%', 'x', 'iang', 'iao', 'p', 'i', 'p', 'ieng', 'sh', 'en', 'm', 'e', 'r', 'en', 'd', 'e', 'sh', 'iii', 'h', 'ou', '$', 'n', 'i', 'q', 'ie', 'iao', 'j', 'i', 'zh', 'e', '%', 'zh', 'e', 'g', 'e', 'sh', 'iii', 'j', 'ie', 'sh', 'ang', 'd', 'e', 'r', 'en', '%', 'b', 'ieng', 'f', 'ei', 'd', 'ou', 'j', 'v', 'b', 'ei', 'n', 'i', 'b', 'ieng', 'iou', 'd', 'e', 't', 'iao', 'j', 'ian', '$']\n", - "['0', '3', '0', '1', '0', '3', '0', '2', '0', '5', '0', '0', '3', '4', '0', '1', '0', '2', '0', '2', '0', '5', '0', '2', '0', '5', '0', '2', '0', '4', '0', '0', '3', '0', '4', '4', '0', '4', '0', '5', '0', '0', 
'4', '0', '4', '0', '4', '0', '4', '0', '4', '0', '5', '0', '2', '0', '0', '4', '0', '1', '0', '1', '0', '4', '0', '4', '0', '3', '0', '3', '3', '0', '5', '0', '2', '0', '4', '0']\n"
-     ]
-    }
-   ],
-   "source": [
-    "sentence = \"每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$\"\n",
-    "phones, tones = convert_sentence(sentence)\n",
-    "print(phones)\n",
-    "print(tones)\n",
-    "\n",
-    "phones = np.array([voc_phones.lookup(item) for item in phones], dtype=np.int64)\n",
-    "tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)\n",
-    "\n",
-    "phones = paddle.to_tensor(phones).unsqueeze(0)\n",
-    "tones = paddle.to_tensor(tones).unsqueeze(0)\n",
-    "utterance_embeds = paddle.unsqueeze(embed, 0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      " 73%|███████▎  | 733/1000 [00:02<00:01, 255.71it/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "content exhausted!\n"
-     ]
-    },
-    {
-     "data": {
-      "image/png": "[base64 PNG data omitted: attention alignment plot]",
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "outputs = synthesizer.infer(phones, tones=tones, global_condition=utterance_embeds)\n",
-    "mel_input = paddle.transpose(outputs[\"mel_outputs_postnet\"], [0, 2, 1])\n",
-    "fig = display.plot_alignment(outputs[\"alignments\"][0].numpy().T)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 合成语音"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "合成的语音会保存在 `syn_audio` 目录下,使用和 reference 相同的文件名。"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "time: 19.793312788009644s\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       ""
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    },
-    {
-     "data": {
-      "image/png": "[base64 PNG data omitted]"
dsPYeiY+SWyX0dNWetpv1FE7dMc+ckKzN+4DwePuS+X69X/rHDdm98XpdJ7QRv59PLXxVmlBSGE7V75+nzd1/eqbrwTlmzV3cZP8iIapaN3U3aDMoNatsX/KKigWbX9IP75jfNsac+CT0TpAF4BMBBAewDDiai9zqYfCSG6pP694eZYD322oujx0DElLyKvq+9rdxZHMfT7+7fIGjkJeyym6Qrf/7LfeqOIoiRRLc4N92+QuQ7w2H9Wl3j+/Fc/mW7vt+xt2HPE5yPIYfuBY5i6cmdouSwKSvSSlhN54dSdiiIvzfgZz003P6/1yJBw7O4A1gshNgIAEY0HMATAatNPucDsNPTqVvh6TelombU7D+GIKnv2l71H0Lx2pVLbyfaJh0HI1zjyJRngxl979lNfI3dUjpTja1lo4WoJE23+3DXjFmKj6ubk13ei5tVZ67F531Gc16o2BnSsj8z0NMMZo9NTxCg/8ERePspnpDu0NFqscZlfIsOl0wiAOutoa+o1LUOJ6EcimkBETSQcVyp6J4d2hP/vBZt0P/uNRWhl2CMmOxyVHLnjlHFzi90wXlxEVjkCQc/GrvS4PhIkB46WdOtt2uf/zOSZqeswftEW3PXhD1hkMct0LPjQV/zDBgvbcWLL/mOuPhfUou1/AGQJIToB+ArAO3obEdEtRLSYiBbv2bOn1PtmwurVpZOmc26ofb9ASVEqazw6cVWox//X7I1Fj3/Z615orH6j//m/5a73Xdb4h8YHrA0N/crnyDRtL4X1u80rx749L1f39X2HT+Cvn68s9brRCH9HzJvBeHFtyRD8bQDUI/bGqdeKEELsE0Iow+U3AHTT25EQYqwQIlsIkV2nTh0JptmHdM6O6I/L5WHXFbJ5n75/1Sv7VGLT//lvpUTO6GEVPpkkrEorPDnZ37acHR6ZVuL5w1+YDzpGT9UPoHh6ylq8ZzD71kOJ3okje347gSMnwhX8RQBaE1FzIioH4CoAE9UbEFED1dPBABLV4DUMj44QAqu2H8SW/UdtdaWy60XZ9Zv+6GjHQXdTTCN+PSq3ybY23Jaxxs/zVuYivRLhlP23Gba21xvcxYWznpyBERPc5714FnwhRB6AOwFMQ6GQfyyEWEVEjxPR4NRmdxPRKiJaDuBuADc4PU7WyEleTTXlE52wODcn/PKIhICt2/Ubcl6ei57PzCwR3eSVLw0ShLT+X6/0f/5bqfsbN/cX03PIKHSSKcZthrZeyO99RhnBHjDKdygrrEz1MFCXJz903Nl1J8WHL4SYLIRoI4RoKYR4MvXaw0KIianHDwghOgghOgsh+gghAg9ut1oQ2qjjN1ZnRSpYjUz0PhMGp/KK7fQSY6/lnfn6U2d1fLvT0dtug1mDTKxKJS/0YTFXXSvHDlt/PWoqqkIIPG3DzeImjv2gwQ1bCThYv/sw2v3VfvnvE3n5EELg+Kl8dH5sOu76sGSLwi+WGWcWy5pZGO3nu4jW1rn6jQWmLtNfddyuTpvTxy7T1g2vf7sBl782H7keFgMVWjw42fT9KPr9g0g0/PeCzQAKR4FW35GWS/75nR8m+cr47zdbbuO0feL5o2cW5QzcO/4HfLO25KLp8VMFeH32RstGO+obzcFjp2zNjo0ynJUcgoPHnIW7tv3LVDR/YHLRQOA/y7fjcEQa3ntN0vSL79bvw/cmA1O9333ZZmcehVgIvtewxqdT2bI/bPG/NZ+erU6nXTKYt6FYbL79qXTEk2zW7y5MXHOTybo94lETJ/MKcO/4kiPUkZ9au8ncnLfK4vnny7aXyi1RGux8vqzk69ow1k9VnztkMbs7fCIPu387bjgoUG5sM9e6O4cWbiwWML8W4tX4FVQQBfQSCL/92dnvEgvBN6p2ZxSvvcXArTJ6yroiP5gs7FzUst0Fi3L3W5bjfdpDSYj9R04ia+QktHIwUld+CuVvjcpoTga7Dh3H5yYuCIVnDKJI3KJX/wYoTFZS88vekuGMZqNELb2fnYnuT35tmN2amxLQySv1bdFD3fNY3dzlS4O/R4vwME9+YpL0fM9A+NVGtdTfdPIH8h2Wx4iF4BstxqhHsWpueOt73dd3HjqOi//hvVaL2qYLX5hd4vWNOmn06RKjAmat243LX5vvyJ/qlJdmFI4k8gqE7VGqMrK/8a1FACDFfeYUv1LvB6W6JFldlK9qqrjK9KSpK1tqW0zqNSS/4jV7i9BKPR/jm1XhX6Gc13bOh3dV6zwvznBe72WKg9aU2lmDOh9nwhJ3Xei0LNn0q2mOwDdrdxWVYl6z4xA27jHPJ9BD+X31gkcUlC5uapxGx8VC8I3u+EaZdFa1S8560l74lhFfrd5VFAKmPRFe+rr0CZ6ul9XlkhtSgmpWAdDsomzxgLE/V7mJfbS4+EIZZXPUqj2k3YgJJ7MIK/yYzs9YvQu/pWYrSzb57xI0Qp31qhX8+z4qnUzmZJQP2L85Ld9qPUM2qsL5rzkbdV/X4iTxTttfQB0wYKUDWSMnGQ4a1QwdM880auyPby9Gl8e+QtbISRj40hz0/bv7CDOrqqda7PweamIh+EYccLiQpGC3KJoRTsO/nPrwp6/aiQ8WWi8KZo2cpJuMstRkIcdsAXeTjmC+/q29ixQA3lRluSo3JivMblxZIyfhRF4+5v6811aYph/h1eqy2O/Mz3X0WaNm72bng5FI9X++eCbpJRPZKau3l3Sn2unsZYTd0N2JJr0BtMfX3lzslkJWsoqvHfc9Pvthq+sGNIr7OKxe2k6JheAXGHyX/wypt6rTWi+jHfrTb3lvCR78bIVlKCkAjJm1AVkjJ5UY2Rgt1Fm5Wd6dn6v7eq9nZ1raIQA8/qW5/7TL49Mdj5IX5/6Ka8YtxPrdhy2F7sHPSqfXO0HbEETb7N1p1I3RSDd37xHD5upOw+zMcBIea1QqWbugPuy1+fhimX919K2uLatzwK7wdn3iq6Lj3ffRcsM6WXrfYdbISUUDsp7PWF8bdtinueHIKhGtJRaC/8Zc/VFmWI0NnEainHRZd/zy1+ZjxITltjJl1dUmtTVSAOCNORvR+7lZpvtQYqO1rjK9kb8WOwlnB46ewlKHgv+hKvzxEYt6P14Lo2nFdt56/8L3jEaUMmcpc9bvtR3TbrbIrg2CuGe8/KSpIjssCptZ/TluEwCNSmgv3ax/vj742QrTnInJK+wvcgMl13/+POFHtPnLFEeft0ssBN9IcMxGMHoLHGq8FFuzqvmhZvjYBZ4yAD9evBXXjlvo6DN6NVKMeqlq2bL/KI75WDnT6dRX3bxits/hpdrRpZlrwQtCGC/k60Vi6GFnveLYyXws+MX7TStXp2rmI1+sRM9nvpHeJtFqMBV0mRLt4dRCbhZ1dPv7Sx0dR93iUr2GJptYCL7RoEfJjv1B5y583qhvTPdp1RRDFk4XYfSq/pn55BVaP1Q8IvAySoxaO7itv8qt0eME3wQfQJrBQv7BY6dsRcKs3/Ob5TavfbvBU90VhROnSov6O/M3Ycv+Y3hB8nVkJfhBdyzbdajkrE/t1otj5dV4CL6Fgl366jzH+/Sz
5s2PW93v20nVPyO8FIeys25gF73mH3ZHsGEQlJTk5RfoiqjCBzayeD//wfpmJKu1370fGbtwtKGoXrEab9gRfJm5Nn9XdZU6eOxUqXWduBELwfeCUVyrn2VytXX0g8aLG/jrNfJqoL+uqnGvdAXz2h1MO+KKI/+asxFjTUIUt1u4I4MmyCS6PKMIjRR2Zl1uc230XMTqa6nzY9NNawDpkV8gInWTKPOCr5eU4jcxaHBlyLRV+oLvpkyA8pnPf9gmrRm8nQVkt6j/Rtm+aTXTVu3CMRMRWL6l+Jw1EtuFEnzzfuC1DMpEC0Hd6qDTU/kMZ/KmW/rb4yL6C1/9hPYPT3OVFPibDyVZYiH4ezwsevp54Rolbfih904uJDd9Xa1o+9eptqKF1MxcV7jI+uy0ddLs0HPF3freEtyjqXXjhhOqUDi3FUbthuyaud3mqqo5dnxkmm4o665D0SkFrI5WOWJR2M0KqzUkdRKmVRG5IV0aOjq2XoZvuXRvEvnPmYWh4z/vcp59m/OyvZmKk7LVsRB8LzGp75skMHkdjRitxGtjamVw6Ji12G5N1UPJ9WEUfDKvwPXNUxsx5eV7f312aZfQ1FU7HU+19Xj7u1wAhQv+Rs00rNwbtvvAOvgK9gRQPtoL7xmUzHaDdX/o4sdWN+WPFzsrT62XR7J2p/XiuBFqV46bQavdUutOAi1iIfhumWuRKLMhVfPC7dTJKOZXGYXaSdu2i51QSbcNKuxiVMrCKV7us3sPn8RN79jL4nXK8dS02yyk98DRk6bhwJk2R4RO+iaY9el9wiLZLQjUf4ud9oFmNYmsEvOiXllVzXHVwvxjJjkkXgee9zrIiyjTgn+NRfz6sZOFP4gfkYgFBQJ/+Jez+HkzptqoVvj0ZH/7ynR+fLqU/SiRFm5zIWasKRYVbc14vxHCfESleGqsImR2Olh8XpRrLIJWTduDQB1ZZjUTFUJ4KkPgd0e55VsOGFbbdYravWc2616w0Vtk3AwHgRaxF3wvd8e3vvPvYpmy0n7FvzdtXLRGcdtqvraYDkcF5ReTEY3yx7cXW29kE7vzl+MmC3CKb/6SV+LX1EUGVus8i3J/lTRPBMbMMi6t4rT8icKQV77DEEm/nTbiyGhwcuxUcFE8sRd8L6PzU6kPe51S6XHHB/Yz7axq0ADWjSyiil7Ti/mpjkNR6yX9+bLtlsXBXp21HlNM0uY/shFDXxZRBPanXeY+72On8j1HvigYtdsErEs0mOG2kJoW7drjH99erHsjmv1TcC0XYy/4XmqgV0iFbcUhjPK56fYyGv2MSnLDUZ1ICiV7doJJ7e+weMYioujD77folq5QWLXdfcmOOKMUVOvRopbpdkIIaWtBfuI0Is0uz00vfX69PS/Xl2PpEXvBV/tznfJzqpZ90OnafvLqTLmZj155wKQVoFVj8TCwU/foLzrlLxTi4laTzf0fF5YZsJq1vTd/k5TkOatZ+bRV3s4to+qhTtAzURvF5bXgn1MiL/hWP+zdH7qPv1YW1nbEZOXfTrnbCUv9K7zkBnVMucKDn61w1RVIjVsfrRVGNewZexiVhFbYf/Qkpjhol2iE1RrZiE+81RBastkfIdbK2XMWM0ohBPLyC2w1ordD5AXfbPosC6+jgaCwU4dmi4NMxDDx0hUIAI6czLPVBzQM/LoZxQEr//eW/cekuHRe91iiwwoZwQCfLLXhsrT4Ko6ezPeUC6Al8oK/1aC5skzGzrbf1SlMth7w/7uIC8PHLvBl0VdGOrsfQQB6PK/jDw4TO+tHew+fwGoPpcmBwnwTp639wkAJTlCjPWetgu+I3NcG0iPygv+5hAxKKy7qUN/3Y8igesVyYZsQGVZtP+RL/oRRLSEnBDXAf9lmxzeJLZVNmbdhH07LTLfcziqb1gqzhuJRQs89OHnFTkdu6I0WfXkV7A4yoi/4Fj5BGfzHp7rnsikoEKiQGfmfLDCiutgeNbuCugFNWbEDV5zVxHK7tvWqeDqOUStOLVVPy/B0HL9QV/zU9gzWYrUmomC3T3fk1aNh9QphmxAZ8gqEYaekJCIjksIPJpl0QirLjF+0BRk2phOdm1TzdBy7NXK6Na3h6TgrPLqNvrfRW8Iq6zgj3d71brdXdOQFv3tWTV/373f9GZmczCvw1NykrNHj6a/DNkGXn2PicvADIaxFymtFTbt4LSLoR3liLQ9f3MH0/de/tbe+eJvNloqRF/zW9aqgUnlrv6BbZFb685sRE5YH2oyCcYfHirqx5t0FmyxnXkHNgH7Za7NyaYgoCWtBEflT89GJq3DkhH8jgicn22vu7TdCCORanKBxiExggB9s9CAuq/hd3CxI3vexTMaW/UcxfdVOLJSYeLV+t3X4ZuQFP6dTg8CiDMJk6eYD6P3crLDNYCSQ5Dj8soSfM5GhY+bhlveWOO7KZcbN7y6x3Cbygl+vajIWbYeOcd6InYkmMkdtTNlE6al9wkNzJy12XFiRF/y8fBFYWBnDMEyc+U6nlIkaKYJPRAOIaB0RrSeikTrvlyeij1LvLySiLLv7jlr1R4ZhmKhy9RvmTZc8Cz4RpQN4BcBAAO0BDCei9prN/gvAr0KIVgBeADDa7v4PHItmvRSGYZgoUq5+q25G78kY4XcHsF4IsVEIcRLAeABDNNsMAfBO6vEEAP3IRkB5QYEIvHwowzBMWUVG7nEjAOqavFsBnG20jRAij4gOAqgFoITDiYhuAXALANSs1xAtHpwswTyGYRgGiNiirRBirBAiWwiR3axRAzwzrFPYJjEMw5QZZAj+NgDqikmNU6/pbkNEGQCqAShdO1RtGAFXZDfB8O5NJZjIMAzDyBD8RQBaE1FzIioH4CoAEzXbTARwferxMADfCJv1PKtXzJRgIsMwTDI4uXO9YQaWZ8EXQuQBuBPANABrAHwshFhFRI8T0eDUZuMA1CKi9QDuB1AqdNMImZloDMMwZZlnhpq7waUUjBZCTAYwWfPaw6rHxwFc7mbfGWmENAqupjfDMExcueKsJrjS5P3ID5+PnyqQ0gMz6tx/YZuwTWAkUbMSdyZj7CHTg2FnX5EX/DfmbkR+xDoI+cFdfVvh3T92D9sMRgLt6nvr6MREg+7N/evFMeHWc3B775ZSa+l8cHMPy20iL/hPDOloq0+mW/7Uq4Vv+3YCEeGCNnXCNoORQPsGVcM2ITRqlKEgi9t6t/Rt39lZNTFiQDvUqVJe2j7PbFLdcpvIC/6Bo6dwzMeuVLf3auXbvmUz5Z6evjaDYeRQwccBStR5KKe9pWuhV0ADm3pVvYlp+QA62Tw06HRp+0qzUUc+8oI/x6L6m1eqxWhEUiEzHSj73i3bzBnRJ2wTdKl2WnzOKdmkkXXJX69CbBevM63MACIER09da/r+1Wfby0MaddkZtraLvOD/sje5/UG1ZKQR672KqI6kr+nRLGwTQuG8VrVsbbftwDFPx7mofb1AjpPdzFsT9M42XCx7UnXxjahYzt453qutvVlT5AW/d5u6vh+j/+n2TqCwSUsjHA2oAXQcSI9oK7So2RWUPTed3wITl2+33O6
79aZJ9pbccG6Wre1+2uVtsGijvqMpRq6tMxpVK3qcnWV+U+nZ2p6Q161ir1FU5AX/iuwm1ht5ZO76Pb4fQwZHuIF5EZXKpfvS+vKcFvZGqWYEpfddbIwggeBaLnbLqoG9h81HrADQsZE3V0snm3932OgVE7iwfT18evu5qm3M92F1Q1Cwe1OPvOC3qV/Z92PEpV5P9QT7hrXM+O9evghZoxqned6H15GhXT6/47xAjmOXqhUyLQWsftUKnhdtK5fPQO3K0c91aFG7tHZlpBEyHSwGCwG8dFUXaTZFXvAz0/w38UKbPsGwqVU5mMWuIHjrxrM8fb5ulQqRTXCKmEcnUKxCqGtXKSclkXLEgHae92HGi1d6F1m9kG/tWMBqyHJaZjoGdmzg2RaFyAu+VajRf3vIUFUEo229eCTK2BGSm85v7r8hDqhSoXT1jicv7Yg+bb2tzaSnkS8jaRk5fkGN8KPIZV0bm75fISMd57b07ja79MxGpu97za+RETqaZuM8GN7d3GWdlkYol5GG3FE5nu0BYiD4Vlx3Tpbrz/Y/vVB04nKB2rHz3oiVaNBzOygjvCiGL2amW3/Hj/5e28GzmK5N4+Ffls0DAwtH3JV1bvBqbuvdEq0lDLCs3CJe82vSbZwHVuhdrlUrlDznLz3T/AYpm9gLvpdEJCXiJQ5T8Ft72cv6q1xeSj08aejZowj9C1d2DtocSx7KMU+EaVW3MsqbuC06NU6m4CsDr/kbzCNwMtPTIGIQXKwVZlk8rDNYCDIzO/aC7yXkrCA1f/ejONtfLIRDzTU9rBeNG1SzF3YVNepVLW33RR0K10yiViJpSJeGqGJxob91w1mm7qgbz8uSbFU8OC0VL251nsqcTPdsXdvSnjDJ0MxCbu3VEhXLlR4A/c/vgpuVx17wvbhjrlfcQT6M8J0k3zw+uKOUY8qsy+Enytcto7lNGFVGa1Qytlu5yJ9NaHvO2haBBee0qCUtW3y0Se33ci6zZJ+69Ax8cLO2Jbc7ymkEf+RA/YXmIF3KsRd8M6zSjf2M8qiQmW74A2uxUwPDzkLXBzfJOVGNWPKX/lL2o5zg3Zq5q0ao9v3f3a+1FJvsQgRkmESOFaRCRS+XmD9iti5wto8VHe0y6Iz6RY8zLHzfGelpts53I9SDGjuLok75w9lNcW5L45mDE9R/ZrNaFQ23u8BmcpURTWsa71tLmRb8qyzi65vVqgQAqGqx0GSE0aJjq7qF8bd2/e52sJNJp51CykbWBeZlL+XS07D8kYuk2KHFjl2Vy2eYuhHt5gY4Kb07rJvxzeOjP51jez9+0Vm1bjGgY32TLQupWdF4oGXlEurW1Fu5gyBRzzJuM9ECr5nQ6kQuK8q04APAFdnGq6HK398AABgNSURBVODKD+J2SvVng1hgJ3dcu9hZnM5KjSL8mrm4jVzQum68jPCuP7e0q6xF7UpSFr6uS6Xs547Kwbf/21t3m+omYgXor1no4sCtkVVb/vkkE3WIpJ1Bgdnvf+055q5Q9eSqosU1oZ552GHEgLalXvPS20C9HtRRVU7BLlqXkBGVdNYFjIiF4Deq7j770c8CW38wqGTnh0fOzuhduXH5ccPZ+NQgx5ELfVIFnW48V15uwEUdSl/EX93fC1/edb7nfaszmW0Ltwa7i4UFJivW6hnnjPsvQI/m3uPW/aSu6rvy2rvC6oahDrCwOh+XbT7g6Nj92pVOwDyV761ByZAuDQGYu3SMmGUw6NDiZIE6FoLvBS83C7fEJKxfF6NKhG5G5coN6J7+rdGiTiVPdinU0VkUTE8jT7MGBfVMz8+BwlkW9VHUo9xWdavo/m2/79xQul0y8OqeUHJjjKjqIHdj+8Hjjo6tuGLVeF1fHnVZJ8z9cx/L6C89GvqgXWVe8G/uGXxHK5kLdm7wcpKeKdFHOlglSt/8d28A9kJQzciqLefGESZ392ttur5TuXz0EtKCotpp5i4zO/Xhn7vcXX6H3s1KPRGb/0Bf9G3nLEP8tHLpaFwjOi65MiH4b96Qbfie0cjPz5LIv9NxO9ilm8ca3AA8BbjbWXSzyyU66e8NqgU/47JLUBOzKhUyTRvv/JeN8hh2Uv9lzW7/MfxMw/cUl4UsrCYIdtYIhnWTl72qDq+uX7WCa1dfVIiF4OuVGQWAhqkV/b46vrf/u9U8esFuJ5mgmaBjt53KgD8/ObDocUDVcAOhsYTqlW6RUSpZjzQqDt/Ue89ODHmXJtaLgA8OOr2o5IEX9PIllEXyx4fIySFRsBL0AGoplqBT4+LvmYjQtl6x28dul6koEQvBN/LbmS1WnJVlHvbWx+HUTI2TLNrcUTmWyShq9CKGJt7pbEFSL3W9TT17ZaazalVEhUz/TosMhz5eteDb/RvcohUbP7NmjW7Kdv22LetYfxeZ6eRptqmgl9A3+Z6eyB2VI70ektVajB9Z8WZox5rXq5qv9DPxEtzjMD9E7f70EhlkRSwE3+jLM0uA8ROnC3pGMxQrnru8M3JH5dgSAbVY3XR+6XWL6ff1sixJ3LN1bdchquqRkBmtHYr2bb2Li2D9/XJ5dcH10AqbzPUMLV5vqnZ+p95t60rpdtWufsmQVzsuJ7dYRflY/Tk1XGZvG2Vsa5u1KN/7wI71TTPb7+3vTPAvV4WPT767J74b2dfR5+0SC8E3muIahUX6jdOL6B6HP/6Dg9phYMf6tnyRvdvWQe6onBI21TdIXrEqSXxXX307Nz41yNKONCLLUq+/PD1I1/1mRpfG1fHQoMIZ1RkWN5Up9/R0tG8t2hmj1pVmVrtFD6MSvg2qnWbYrcpNNIcRTsoL2C1z8fGfzsFfLzauFuoVK5utFkDtNhf5j2rWPLx7E+R00q85r1f7JndUDsZc0w2AsevY6cBJXaI9LY18iy6MheAbfXeyQv2c4nR01sThKv0tF7QsOqHMWPe3AXj7xu6lXu/qcmTasHrpG8VZWTVshzw+fVkn1ePS/k2nF0HuqBxUq5iJmy9oYaseuFN3kRVEVCIZ52IDUTDCaGZXp0p5w+/iDIP2f+o8gyY1g1vXCKPfs1lopvamrC05Yncwpgwenh3WCU9f1smWi0wPK9dx1IiH4Bv47dwGo6x+/HcerAEGd25UYoSg5nKdUblZko1T1OsH5TP0p79moyT14q4WZfSkjgAZf4u91H2tfp1hM7Nw+cPyyiQYzWy8oE6J79XG2bqPzMVz9eAmXfNljx7qffHQbtkMO1FkZxrU/rmvv71Cd01r2h/IPXFJyUVjteDrNd9Rs/aJAbZCqP92SUeMu944EvDaHs2w4IF+yB2Vg5eu6mIaNWiF05G9k/VBICaCX9Fgcdbo5Jv5P70N9/XkpR11p2lOSE+johHC2icGlHjvfJ1pf57EK/+mni0weugZWPP4AOuNdbAz5VVP2W03R04JhlIl0u7io1l4olNkukMUiKiozILVDUXr8pGZgKc+Z/M1A4g2Og1FnHZIus6ipIFyw7FzPqj94ep+0VaF1RSchFW20ORlqMMmb+ttXsvK7lrcNT2amS7QPnFJx6JzY0iXRo7dlkDx7P
c+k+qvejewbIdh3LEQfKNpUyWDZh/NDEoLPPr79rj6bPtli+1g56RxU0fDjCvPamqZTu00QURN4xoVsfGpQY5uKooOKL7QqPabdUOdKuVRxUZjmff+q2S1Uq96bxRrP6xryVGpdjalzuS1uukseKAfpt7bE9kGlUsV374TYTm/VfGNT+1ysVsV0skamdY19tjgDsXvBRzR4wU7pc1r6VxTTtcTYyH4TtPmtdsro4yeEvpUWqHnm20QQrKGevrsJp48LY0c1eioVanwhHVTUTPqpSgqlsvAisdKugH/dIF1BrebiCfl+8uqVbFUSKXiitRmK2vrLKlvFFZRL/WrVUC7+lUNo1+UTPUrz7KfPa7+u9UDj6Yu6sk4RfbgKkroBVVYBTJoiVY/PJ948pKOyEwnNK/lfZF36V8vNH0/itrlZ1y9wu19WqaOlY4FD/Rz9NmJd5yP3/9zrh9m+cbIge3w+uyNptvouVrM+OS2c9Ey5TqZ9b99Sr1fPiMdzWtXQi0Lv+11qljxWpXLY9Vjv0OHR6aZfqaFwaKlUqnTaL3IiO9G9kXdKuVLlIuOSg/jsII97GAWg1+3aunf3elM3pMSEFFNIvqKiH5O/a877yOifCJalvo30csx3ZCWRnh8SEfTmYKeKDbU8dlauSr0pl1lCaPiauqcCKeLp05HKW6wahTidGpsZ/R+ay9ndZy6NathWn45PY1M16cUtFUkjVyfaox+M0XoOzaqhlk2jq3QqPppyExPQ4XMdHx4cw/88nTJ0N7uJtEtsmZ8RvsZ2jXYxuF2yR2VYzpD0XMfO3Wdeh36jQTwtRCiNYCvU8/1OCaE6JL6N9jNgZY9bD6y9ope5yQ3DUXObSWnW45X2jesilf+0BWf3n4uRktst3ezgSvDSzN5Pb644zyp+7ujTyvTfIINNnINnBJk67ogcFu47pyWtUp9F2//0TwJ0E/cJkKGjbKOou6AFnSUzhAA76QevwPgEo/7M8Sq8YRX9M4Bv0cafpKeRsjp1ABdm9aw1S3Lro1GNcibSXCXqZFdpOr8VrWllFBm5OA1Uk6NkuXtNDIpbhARpt7bEy+bFLOzwqvg1xNC7Eg93gnAKB6pAhEtJqIFROTbTcELenf9JMmD3cXWtj7V+VC7wl66qov0mHoWe+fIKLxmhrarmFXJBqPudf8c3tWyNaIat4mJUaBd/aqOR/VqLAWfiGYQ0Uqdf0PU24lCxTSaKzUTQmQD+AOAF4lIN0CWiG5J3RgW79mzp9T7F5hE2ZzuscWdXqj8cE1P3Mu66qfKlwX+doncqodOUXeyGnSGs4xWNVYNNF75Q1fX+y5r3N23VYnn2kzlHj5VC1XQzgr7Wfx2D/++g+7rTWtVxHydQAEjz00bH4uTBYGX5jyWgi+E6C+E6Kjz7wsAu4ioAQCk/t9tsI9tqf83ApgFQHdOIoQYK4TIFkJk16lTWtxlp86XPHbp19o3tFc0ymrVPw6+XL1OUkHy6ODiZC+79VD0eMyiXK+VqMhmpM+jZC9o16gaaEprdDao9yOTK7OboEuT6hjWrTE6NDRfvHd6/etVjQWMEznjhNs8F68unYkArk89vh7AF9oNiKgGEZVPPa4N4DwAq90czOzn7t3WW4y9Xtu5ZjUroZLq5DA6IUf8LroXtV3CvifJqnzqpuiUX5UJAZh2tgob7SDnhSu6YFi3xlj28IVSS16YMXpYJ3x+x3l47vLORWGbbnI59DAa4ctcPwgLt/kGXq+yUQAuJKKfAfRPPQcRZRPRG6ltTgewmIiWA5gJYJQQwpXg36magr6tKfVr1SfUim6qz795QzZ+eXqQ7UQRmV2igkap0qedzQSNjDK+Ctf2KJlNbVQJUSHTZ/9+VOLPrcjOqonnLu+M6hXLSS154RS9In6Av32G48aN52Whn4tsek+CL4TYJ4ToJ4RonXL97E+9vlgIcVPq8TwhxBlCiM6p/8e5PZ66Prm2VnlPm2nbRijxxs9f0Rl929UrcsNceVY0O2N5ob1qveOVqwt92l5aD8rSywcGtsOYq7372NXp9UBJv30YM5nxt/Sw3Ob8iITzRgFZI3AlEENb074s0KdtXYy7wXloayxKK6i59MxGGNq1calRkxe/r0LuqBxcpknK+EvO6Vjx6EWWxaXihLq8bqu6lT2Hs8la3PtTr5YY6GHBVsEsIkcvY1TGaNbMdWMUUKAOCvBS+wiwLp+hNxq0mvnEHSUaR4Y2lBVi9028cGUX/P0Kd13p3ZCWRqhSIVN6784wUeqjyGLc9eEl0cjAadkAhcGdGxaNzBtZ9N59fEjhrKNSiQXDYiez1zUoBaN7nZ5bKS6uJrcoIcQ1fM7hiRPxX73QYFSLmynGqhaLU5wUWStLKAkwM1bvwrmtzEfY152ThYe/WIVHBuuHFlp1crLi6h5NMX/jvlLurCKiHygmnfIpn39Tg+q5SSR2I3wrPrzZ2l8aJdwsvDDRon/7erb9zkO6FDerVrecdNKOUI+cMxrg4Yvb49pzsjztJyrccG4WblAVgXND5VQNobLuunJCmRP8sMML40BNj1Pcm3xsYi2bCQY9R8Mgd1ROCfeR4ts366ZkFyLCH01+F9mlKvzm0cEdpBU5Y0kopky5dPq2q4tMSfHcTrmwfT18tXqX488NVo34gsLrImUcXDgNq1XA9oPHI51G36RmRXRuXA3nBRChM6hjA4yZtaHEa1EfHHWQECrcpl5lNK9dCSsevQjlM9I9z6TiTpkS/DddhCnJ4vkrOuPQ8TzHn/NaEiIM4lBs8IUru+DKsQsiX0PnC4PeyLLRE3cvNVmCQMZvN/2+XhIsKTuUKcEPkyoVMl31VJXZ4NwpXV0ucMdhYfzsFrXKfPVEhnFKsuc3ESAvPzzBd3tos4bOTDyoqtMQmyn7sOCHTGOL+G0/ae6hx+iN52XJMyREqodYQiBMRg+V1xSHiQ98mw8Zvxu7GDFnRB9PNecf+X0H3NuvDX47cUqiVcHz/YP9wzYhFCrEYOEdAJ4Z1gkjJvyIe/uX7kjHOIcFP6E0kZCMUq1iZqhFtmSQ9KiNqHNFdhP0aVsXdapEe4E5LvDZzjAJoIbL+ulRgMVeHiz4AfHUpWeEbQKTYLR9AtrHMByY8Q4LfkD42a2LcUabepXDNiF04pZ5y8iBBZ9JHJyMwyQVFvyAaBvzxskMw8QfFvyAaFOvtOD35wQmhmEChAU/IPRqmbAvmWGYIGHBD5FLz2wUtgkMwyQIFnwmsTx9GYfKMsmCBT8g4lBSOGmEWamUYcKABT9EWG7CpYnHPrIMEze4lk5A6C3acjJWeGx4ahDS+ftnEgaP8EOkRR2O0gkLFnsmibDgMwzDJAQW/IAol85fNcMw4cIqFBBRb6bNMEzZhwWfYRIG15dPLiz4DJMQLmhTBwAwg6uFJhYWfIZJCJkpt2Lc21Iy7uE4fIZJCNf0aIYG1bnxSZJhwWeYhNCnXV30aVc3bDOYEPHk0iGiy4loFREVEFG2yXYDiGgdEa0nopFejskwDMO4w6sPfyWAywDMNtqAiNIBvAJgIID2AIYTUXuPx
2UYhmEc4smlI4RYAwCkVyimmO4A1gshNqa2HQ9gCIDVXo7NMAzDOCOIKJ1GALaonm9NvVYKIrqFiBYT0eI9e/YEYBrDMExysBzhE9EMAPV13npICPGFTGOEEGMBjAWA7Oxsrh7MMAwjEUvBF0L093iMbQCaqJ43Tr3GMAzDBEgQLp1FAFoTUXMiKgfgKgATAzguwzAMo8JrWOalRLQVwDkAJhHRtNTrDYloMgAIIfIA3AlgGoA1AD4WQqzyZjbDMAzjFK9ROp8B+Ezn9e0ABqmeTwYw2cuxGIZhGG9wLR2GYZiEwILPMAyTEFjwGYZhEgILPsMwTEJgwQ+QPw9oF7YJDMMkGBb8AKl6GlejZhgmPFjwA+Sqs5qiRZ1KYZvBMExCYcEPkPQ0Qss6lcM2g2GYhMKCHzDNalYM2wSGYRIKC37AVCxf6Me/q2+rkC1hGCZpsOAHTPesmgCAW3u1DNkShmGSBoeNBMz5rWsjd1RO2GYwDJNAeITPMAyTEFjwGYZhEgILPsMwTEJgwWcYhkkILPgMwzAJgQWfYRgmIbDgMwzDJAQWfIZhmIRAQoiwbdCFiH4DsC5sO0yoDWBv2EYYEGXbgGjbx7a5J8r2Jcm2ZkKIOnpvRDnTdp0QIjtsI4wgosVRtS/KtgHRto9tc0+U7WPbCmGXDsMwTEJgwWcYhkkIURb8sWEbYEGU7YuybUC07WPb3BNl+9g2RHjRlmEYhpFLlEf4DMMwjERY8BmGYRJCJAWfiAYQ0ToiWk9EI8O2Rw0RvUlEu4loZdi2aCGiJkQ0k4hWE9EqIronbJsUiKgCEX1PRMtTtj0Wtk1aiCidiH4goi/DtkULEeUS0QoiWkZEi8O2Rw0RVSeiCUS0lojWENE5YdukQERtU9+Z8u8QEd0btl0KRHRf6npYSUQfElEFX48XNR8+EaUD+AnAhQC2AlgEYLgQYnWohqUgogsAHAbwrhCiY9j2qCGiBgAaCCGWElEVAEsAXBKF746ICEAlIcRhIsoEMBfAPUKIBSGbVgQR3Q8gG0BVIcTFYdujhohyAWQLISKXPERE7wCYI4R4g4jKAagohDgQtl1aUtqyDcDZQohNEbCnEQqvg/ZCiGNE9DGAyUKIt/06ZhRH+N0BrBdCbBRCnAQwHsCQkG0qQggxG8D+sO3QQwixQwixNPX4NwBrADQK16pCRCGHU08zU/8iM9ogosYAcgC8EbYtcYKIqgG4AMA4ABBCnIyi2KfoB2BDFMReRQaA04goA0BFANv9PFgUBb8RgC2q51sREdGKE0SUBeBMAAvDtaSYlMtkGYDdAL4SQkTGNgAvAhgBoCBsQwwQAKYT0RIiuiVsY1Q0B7AHwFspd9gbRFQpbKMMuArAh2EboSCE2AbgOQCbAewAcFAIMd3PY0ZR8BmPEFFlAJ8AuFcIcShsexSEEPlCiC4AGgPoTkSRcIkR0cUAdgshloRtiwnnCyG6AhgI4I6UazEKZADoCmCMEOJMAEcARGrdDQBSrqbBAP4vbFsUiKgGCr0XzQE0BFCJiK7x85hRFPxtAJqonjdOvcbYIOUf/wTA+0KIT8O2R4/UlH8mgAFh25LiPACDU37y8QD6EtG/wzWpJKnRIIQQuwF8hkLXZxTYCmCrarY2AYU3gKgxEMBSIcSusA1R0R/AL0KIPUKIUwA+BXCunweMouAvAtCaiJqn7spXAZgYsk2xILUwOg7AGiHE82Hbo4aI6hBR9dTj01C4KL82XKsKEUI8IIRoLITIQuH59o0QwteRlhOIqFJqER4pd8lFACIRJSaE2AlgCxG1Tb3UD0DoQQI6DEeE3DkpNgPoQUQVU9duPxSuu/lG5KplCiHyiOhOANMApAN4UwixKmSziiCiDwH0BlCbiLYCeEQIMS5cq4o4D8C1AFakfOUA8KAQYnKINik0APBOKlIiDcDHQojIhT9GlHoAPivUBGQA+EAIMTVck0pwF4D3UwO0jQBuDNmeEqRukhcC+FPYtqgRQiwkogkAlgLIA/ADfC6zELmwTIZhGMYfoujSYRiGYXyABZ9hGCYhsOAzDMMkBBZ8hmGYhMCCzzAMkxBY8BkGABHVUlVU3ElE21KPDxPRq2HbxzAy4LBMhtFARI8COCyEeC5sWxhGJjzCZxgTiKi3Uh+fiB4loneIaA4RbSKiy4jomVSd+qmpshYgom5E9G2q0Nm0VNlqhgkdFnyGcUZLAH1RWIjr3wBmCiHOAHAMQE5K9P8BYJgQohuANwE8GZaxDKMmcqUVGCbiTBFCnCKiFSgs/aGUOFgBIAtAWwAdAXyVKoWQjsLStwwTOiz4DOOMEwAghCggolOieBGsAIXXEwFYJYSITJs/hlFglw7DyGUdgDpKX1ciyiSiDiHbxDAAWPAZRiqptpzDAIwmouUAlsHnGucMYxcOy2QYhkkIPMJnGIZJCCz4DMMwCYEFn2EYJiGw4DMMwyQEFnyGYZiEwILPMAyTEFjwGYZhEsL/A/dtCXm6BAJfAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "with paddle.no_grad():\n", - " wav = vocoder.infer(mel_input)\n", - "wav = wav.numpy()[0]\n", - "sf.write(f\"syn_audio/{ref_name}\", wav, samplerate=22050)\n", - "librosa.display.waveplot(wav)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ipd.Audio(wav, rate=22050)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/aishell3/vc0/local/train.sh b/examples/aishell3/vc0/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..eb968b5fc732c470328708ef2c1798b5cf77a6f5 --- /dev/null +++ b/examples/aishell3/vc0/local/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device="gpu" \ No newline at end of file diff --git a/examples/aishell3/vc0/local/voice_cloning.sh b/examples/aishell3/vc0/local/voice_cloning.sh new file mode 100755 index 0000000000000000000000000000000000000000..ee96b9e0dbaca0336626936379e3af54fa30c0e3 --- /dev/null +++ b/examples/aishell3/vc0/local/voice_cloning.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +ge2e_params_path=$1 +tacotron2_params_path=$2 +waveflow_params_path=$3 +vc_input=$4 +vc_output=$5 + +python3 ${BIN_DIR}/voice_cloning.py \ + --ge2e_params_path=${ge2e_params_path} \ + --tacotron2_params_path=${tacotron2_params_path} \ + --waveflow_params_path=${waveflow_params_path} \ + --input-dir=${vc_input} \ + --output-dir=${vc_output} \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..485d73bf7bc2d0541c6d29976cffe64282efd8cf --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=voice_cloning/tacotron2_ge2e +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/aishell3/vc0/run.sh b/examples/aishell3/vc0/run.sh old mode 100644 new mode 100755 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8d3da78135c829b1c95b521dd528c94ac7525a9b --- a/examples/aishell3/vc0/run.sh +++ b/examples/aishell3/vc0/run.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +input=~/datasets/data_aishell3/train +preprocess_path=dump +alignment=./alignment + +# not include ".pdparams" here +ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000 +train_output_path=output +# include ".pdparams" here 
+ge2e_params_path=${ge2e_ckpt_path}.pdparams
+tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
+# pretrained model
+# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
+waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
+vc_input=ref_audio
+vc_output=syn_audio
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1
+fi
+
+
diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..08a513491419859e999a8872a393880a4b9cb4f3
--- /dev/null
+++ b/examples/csmsc/README.md
@@ -0,0 +1,11 @@
+
+# CSMSC
+
+* tts0 - Tacotron2
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
diff --git a/examples/csmsc/speedyspeech/baker/inference.sh b/examples/csmsc/speedyspeech/baker/inference.sh
deleted file mode 100755
index 880a1fd53d3baa46bd5ac62a1c512f0ed6c7aedc..0000000000000000000000000000000000000000
--- a/examples/csmsc/speedyspeech/baker/inference.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-python3 inference.py \
-    --inference-dir=exp/default/inference \
-    --text=../sentences.txt \
-    --output-dir=exp/default/pd_infer_out \
-    --phones-dict=dump/phone_id_map.txt \
-    --tones-dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/speedyspeech/baker/README.md b/examples/csmsc/tts2/README.md
similarity index 86%
rename from examples/csmsc/speedyspeech/baker/README.md
rename to examples/csmsc/tts2/README.md
index 0484d484647c6df48ade5a8f22f904e5de366241..4283e8cccb8a9dc8ff01251f68c1507b5331db58 100644
--- a/examples/csmsc/speedyspeech/baker/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -1,5 +1,4 @@
 # Speedyspeech with CSMSC
-
 This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
 ## Dataset
@@ -10,12 +9,23 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
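All of the new `run.sh` scripts in this change share the same Kaldi-style stage gating, driven by the `--stage`/`--stop-stage` options mentioned in their comments. A minimal, self-contained sketch of the pattern, assuming only that `parse_options.sh` is available under `${MAIN_ROOT}/utils` as it is in this repo:

```bash
#!/bin/bash
# Sketch of the stage-gating idiom used by the run.sh scripts in this diff.
# parse_options.sh rewrites `--stage N --stop-stage M` from the command line
# into the shell variables `stage` and `stop_stage`.
stage=0
stop_stage=100

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

# Each block runs only when its index falls inside [stage, stop_stage].
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: preprocess"
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: train"
fi
```

For example, `./run.sh --stage 1 --stop-stage 1` runs only the training block, while the defaults (`stage=0`, `stop_stage=100`) run every stage in order.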
-## Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
 Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+5. inference using static model.
 ```bash
-./preprocess.sh
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
 ```
 When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
 ```text
@@ -37,13 +47,12 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance.
-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
 ```
 Here's the complete help message.
-
 ```text
 usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
                 [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
@@ -81,20 +90,7 @@ optional arguments:
 6. `--phones-dict` is the path of the phone vocabulary file.
 7. `--tones-dict` is the path of the tone vocabulary file.
-## Pretrained Model
-Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
-
-SpeedySpeech checkpoint contains files listed below.
-```text
-speedyspeech_nosil_baker_ckpt_0.5
-├── default.yaml # default config used to train speedyspeech
-├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
-├── phone_id_map.txt # phone vocabulary file when training speedyspeech
-├── snapshot_iter_11400.pdz # model parameters and optimizer states
-└── tone_id_map.txt # tone vocabulary file when training speedyspeech
-```
-
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
@@ -107,9 +103,9 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
@@ -152,9 +148,9 @@ optional arguments:
   --device DEVICE device type to use
   --verbose VERBOSE verbose
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
@@ -203,21 +199,42 @@ optional arguments:
 4. `--output-dir` is the directory to save synthesized audio files.
 5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
-6. `--phones-dict` is the path of the phone vocabulary file.
-7. `--tones-dict` is the path of the tone vocabulary file.
+7. `--phones-dict` is the path of the phone vocabulary file.
+8. `--tones-dict` is the path of the tone vocabulary file.
+
+### Inference
+After synthesizing, we will get static models of speedyspeech and pwgan in `${train_output_path}/inference`.
+`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for speedyspeech + pwgan synthesis.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
+```
 
-You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+## Pretrained Model
+Pretrained SpeedySpeech model with no silence at the edges of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
+
+SpeedySpeech checkpoint contains files listed below.
+```text
+speedyspeech_nosil_baker_ckpt_0.5
+├── default.yaml # default config used to train speedyspeech
+├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
+├── phone_id_map.txt # phone vocabulary file when training speedyspeech
+├── snapshot_iter_11400.pdz # model parameters and optimizer states
+└── tone_id_map.txt # tone vocabulary file when training speedyspeech
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash +source path.sh + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e.py \ --speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \ --speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \ --speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --inference-dir=exp/default/inference \ --device="gpu" \ diff --git a/examples/csmsc/speedyspeech/baker/conf/default.yaml b/examples/csmsc/tts2/conf/default.yaml similarity index 100% rename from examples/csmsc/speedyspeech/baker/conf/default.yaml rename to examples/csmsc/tts2/conf/default.yaml diff --git a/examples/csmsc/tts2/local/inference.sh b/examples/csmsc/tts2/local/inference.sh new file mode 100755 index 0000000000000000000000000000000000000000..37e2e55c7b2b054ce02044473d62630f8d315ec1 --- /dev/null +++ b/examples/csmsc/tts2/local/inference.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +train_output_path=$1 + +python3 ${BIN_DIR}/inference.py \ + --inference-dir=${train_output_path}/inference \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/pd_infer_out \ + --phones-dict=dump/phone_id_map.txt \ + --tones-dict=dump/tone_id_map.txt diff --git a/examples/csmsc/speedyspeech/baker/preprocess.sh b/examples/csmsc/tts2/local/preprocess.sh similarity index 88% rename from examples/csmsc/speedyspeech/baker/preprocess.sh rename to examples/csmsc/tts2/local/preprocess.sh index 422caa310a33b87070ccc4f75898f42935f7a3c4..f7f5ea74c742c4edfa1fc7084a14807f8e35fdae 100755 --- a/examples/csmsc/speedyspeech/baker/preprocess.sh +++ b/examples/csmsc/tts2/local/preprocess.sh @@ -1,9 +1,10 @@ #!/bin/bash + stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,17 +12,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=baker \ --rootdir=~/datasets/BZNSYP/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True \ --use-relative-path=True @@ -38,7 +39,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/tone to id, dev and test should use train's stats echo "Normalize ..." 
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy \ @@ -46,7 +47,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy \ @@ -54,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --tones-dict=dump/tone_id_map.txt \ --use-relative-path=True - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy \ diff --git a/examples/csmsc/speedyspeech/baker/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh similarity index 61% rename from examples/csmsc/speedyspeech/baker/synthesize.sh rename to examples/csmsc/tts2/local/synthesize.sh index 7c37c5bd7a7a9bbac2b77a1f68f8fcd593ff796e..418ee02e6fb4da217365294018244b3e2e365c29 100755 --- a/examples/csmsc/speedyspeech/baker/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -1,16 +1,20 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --speedyspeech-config=conf/default.yaml \ - --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --speedyspeech-config=${config_path} \ + --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --speedyspeech-stat=dump/train/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ - --inference-dir=exp/default/inference \ + --output-dir=${train_output_path}/test \ + --inference-dir=${train_output_path}/inference \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ --device="gpu" diff --git a/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh similarity index 54% rename from examples/csmsc/speedyspeech/baker/synthesize_e2e.sh rename to examples/csmsc/tts2/local/synthesize_e2e.sh index 4800a0f71d8ac94c9137b403b591f161fe11c8d2..c50fa77659e0f2f975d723a6c16990d6bcf72c92 100755 --- a/examples/csmsc/speedyspeech/baker/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -1,16 +1,21 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python synthesize_e2e.py \ - --speedyspeech-config=conf/default.yaml \ - --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --speedyspeech-config=${config_path} \ + --speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --speedyspeech-stat=dump/train/feats_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ - --inference-dir=exp/default/inference \ + --text=${BIN_DIR}/../sentences.txt \ + --output-dir=${train_output_path}/test_e2e \ + 
--inference-dir=${train_output_path}/inference \
     --device="gpu" \
     --phones-dict=dump/phone_id_map.txt \
     --tones-dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/speedyspeech/baker/run.sh b/examples/csmsc/tts2/local/train.sh
similarity index 64%
rename from examples/csmsc/speedyspeech/baker/run.sh
rename to examples/csmsc/tts2/local/train.sh
index 64936ef34da8a165413e090c1b02162c2a7a3cf2..e44c7da5b5c719d3eec9f3350d8b7a921e3faca5 100755
--- a/examples/csmsc/speedyspeech/baker/run.sh
+++ b/examples/csmsc/tts2/local/train.sh
@@ -1,11 +1,14 @@
 #!/bin/bash
 
-python ../train.py \
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
     --train-metadata=dump/train/norm/metadata.jsonl \
     --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
     --nprocs=2 \
     --phones-dict=dump/phone_id_map.txt \
     --tones-dict=dump/tone_id_map.txt \
diff --git a/examples/csmsc/tts2/path.sh b/examples/csmsc/tts2/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1a9519f37aaa50fc809effd2bea23383908b95f1
--- /dev/null
+++ b/examples/csmsc/tts2/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=speedyspeech
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..200e81929698e59222043953d5cc9a154a2d0a64
--- /dev/null
+++ b/examples/csmsc/tts2/run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi diff --git a/examples/vctk/fastspeech2/baker/README.md b/examples/csmsc/tts3/README.md similarity index 90% rename from examples/vctk/fastspeech2/baker/README.md rename to examples/csmsc/tts3/README.md index a9f0fc8b25ccf9c22d839ca5ab36d6fd1243f28c..735ef6d1697c2dba40be6926b47befc9cb9cebd7 100644 --- a/examples/vctk/fastspeech2/baker/README.md +++ b/examples/csmsc/tts3/README.md @@ -9,13 +9,22 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. -Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. +```bash +./run.sh +``` +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -40,11 +49,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] @@ -78,18 +87,7 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
-
-FastSpeech2 checkpoint contains files listed below.
-```text
-fastspeech2_nosil_baker_ckpt_0.4
-├── default.yaml # default config used to train fastspeech2
-├── phone_id_map.txt # phone vocabulary file when training fastspeech2
-├── snapshot_iter_76000.pdz # model parameters and optimizer states
-└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
-```
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
 ```bash
@@ -102,9 +100,9 @@ pwg_baker_ckpt_0.4
 ├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@@ -144,9 +142,9 @@ optional arguments:
   --device DEVICE device type to use.
   --verbose VERBOSE verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@@ -191,18 +189,31 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
 
-You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+## Pretrained Model
+Pretrained FastSpeech2 model with no silence at the edges of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+```text
+fastspeech2_nosil_baker_ckpt_0.4
+├── default.yaml # default config used to train fastspeech2
+├── phone_id_map.txt # phone vocabulary file when training fastspeech2
+├── snapshot_iter_76000.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash +source path.sh + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ +python3 ${BIN_DIR}/synthesize_e2e.py \ --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ + --text=${BIN_DIR}/../sentences.txt \ --output-dir=exp/default/test_e2e \ --device="gpu" \ --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/baker/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/baker/conf/default.yaml rename to examples/csmsc/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/baker/preprocess.sh b/examples/csmsc/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/baker/preprocess.sh rename to examples/csmsc/tts3/local/preprocess.sh index dff3e349d22da9f4c0dc35dba402085bb126d8f5..c83d9a9b62524a46d2d45eafcc778daefe72dab9 100755 --- a/examples/vctk/fastspeech2/baker/preprocess.sh +++ b/examples/csmsc/tts3/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./baker_alignment_tone \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=baker \ --rootdir=~/datasets/BZNSYP/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..." 
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/baker/simple.lexicon b/examples/csmsc/tts3/local/simple.lexicon similarity index 100% rename from examples/vctk/fastspeech2/baker/simple.lexicon rename to examples/csmsc/tts3/local/simple.lexicon diff --git a/examples/vctk/fastspeech2/baker/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh similarity index 63% rename from examples/vctk/fastspeech2/baker/synthesize.sh rename to examples/csmsc/tts3/local/synthesize.sh index 535ebdba486d3380df459b23e71a956845cc19fc..724afb04a28d0903b31a38db1bdba49aa94b9a78 100755 --- a/examples/vctk/fastspeech2/baker/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -1,14 +1,19 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/baker/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh similarity index 56% rename from examples/vctk/fastspeech2/baker/synthesize_e2e.sh rename to examples/csmsc/tts3/local/synthesize_e2e.sh index a2deec14534a82daecf93e6a3250e66e2bd86b94..8c9755dd0d1115b788ad724871da22c6d791005f 100755 --- a/examples/vctk/fastspeech2/baker/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -1,14 +1,19 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ 
+    --text=${BIN_DIR}/../sentences.txt \
+    --output-dir=${train_output_path}/test_e2e \
     --device="gpu" \
     --phones-dict=dump/phone_id_map.txt
diff --git a/examples/csmsc/tts3/local/train.sh b/examples/csmsc/tts3/local/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..fbbc9a9de11458c3af6d08609af3fd275b77e0bb
--- /dev/null
+++ b/examples/csmsc/tts3/local/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
+    --nprocs=1 \
+    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/csmsc/tts3/path.sh b/examples/csmsc/tts3/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..561d01632ba4b63b582ef07e460d31842823f07c
--- /dev/null
+++ b/examples/csmsc/tts3/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f45ddab060710e7a8545d3eb90d5628f3d76c4bb
--- /dev/null
+++ b/examples/csmsc/tts3/run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md b/examples/csmsc/voc1/README.md
similarity index 88%
rename from examples/vctk/GANVocoder/parallelwave_gan/baker/README.md
rename to examples/csmsc/voc1/README.md
index a58fd9229a14cf68e9b2bcc2db488321f6a19168..2a7b3185b3407e92a2343144cd57337ae5476dcf 100644
--- a/examples/vctk/GANVocoder/parallelwave_gan/baker/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -1,6 +1,6 @@
 # Parallel WaveGAN with CSMSC
 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html).
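The `path.sh` files introduced in this change (for vc0, tts2, and tts3 above) all follow one pattern: they pin `MAIN_ROOT` to the repository root and derive `BIN_DIR` from a per-example `MODEL` name, so each `local/*.sh` stage script calls the shared entry points under `parakeet/exps` instead of a relative `../train.py`. A minimal sketch of the mechanism, using the tts3 values from above (run it from an example directory of this repo):

```bash
#!/bin/bash
# How path.sh ties an example to its shared entry points.
export MAIN_ROOT=`realpath ${PWD}/../../../`
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}

# Stage scripts then resolve to the same shared script for every example;
# pinning CUDA_VISIBLE_DEVICES per invocation selects the GPUs for that
# stage only.
CUDA_VISIBLE_DEVICES=0,1 python3 ${BIN_DIR}/train.py --help
```

The `--help` call here is just a smoke test; the real stage scripts pass the `--config`, metadata, and dictionary arguments shown in the READMEs above.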
-## Preprocess the dataset
+## Dataset
 ### Download and Extract the dataset
 Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
@@ -8,12 +8,21 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of the audio. You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
 Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
-Run the command below to preprocess the dataset.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
@@ -30,17 +39,15 @@ dump
 ├── raw
 └── feats_stats.npy
 ```
-
 The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`.

 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains the id and path to the spectrogram of each utterance.

-## Train the model
-
-`./run.sh` calls `../train.py`.
+### Train the model
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 Here's the complete help message.

 ```text
@@ -86,25 +93,10 @@ benchmark:
 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.

-## Pretrained Models
-
-Pretrained models can be downloaded here:
-1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.
-
-Parallel WaveGAN checkpoint contains files listed below.
-
-```text
-pwg_baker_ckpt_0.4
-├── pwg_default.yaml # default config used to train parallel wavegan
-├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
-└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
-```
-
-## Synthesize
-
-`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
@@ -127,10 +119,21 @@ optional arguments:
 ```
 1. `--config` is the parallel wavegan config file. You should use the same config with which the model is trained.
-2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
 4. `--output-dir` is the directory to save the synthesized audio files.
 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
+## Pretrained Models
+Pretrained models can be downloaded here: [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
+
+Parallel WaveGAN checkpoint contains files listed below.
+
+```text
+pwg_baker_ckpt_0.4
+├── pwg_default.yaml # default config used to train parallel wavegan
+├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
+└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
 ## Acknowledgement
 We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml
similarity index 100%
rename from examples/vctk/GANVocoder/parallelwave_gan/baker/conf/default.yaml
rename to examples/csmsc/voc1/conf/default.yaml
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh
similarity index 83%
rename from examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh
rename to examples/csmsc/voc1/local/preprocess.sh
index df5b7d22e2fd14fca4cd694bf5e8f1328ac8d0f6..61d6d62bef566d385c4d3d2407ce437ec6d8e9ad 100755
--- a/examples/vctk/GANVocoder/parallelwave_gan/baker/preprocess.sh
+++ b/examples/csmsc/voc1/local/preprocess.sh
@@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # get durations from MFA's result
@@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
         --inputdir=./baker_alignment_tone \
         --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # extract features
     echo "Extract features ..."
-    python3 ../../preprocess.py \
+    python3 ${BIN_DIR}/../preprocess.py \
         --rootdir=~/datasets/BZNSYP/ \
         --dataset=baker \
         --dumpdir=dump \
         --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
         --cut-sil=True \
         --num-cpu=20
 fi
@@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # normalize, dev and test should use train's stats
     echo "Normalize ..."
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
         --stats=dump/train/feats_stats.npy
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
         --stats=dump/train/feats_stats.npy
-    python3 ../../normalize.py \
+    python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
         --stats=dump/train/feats_stats.npy
diff --git a/examples/csmsc/voc1/local/synthesize.sh b/examples/csmsc/voc1/local/synthesize.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9f904ac0c6e7006ab40c3d8aaa7c457ad1495b36
--- /dev/null
+++ b/examples/csmsc/voc1/local/synthesize.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize.py \
+    --config=${config_path} \
+    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
+    --test-metadata=dump/test/norm/metadata.jsonl \
+    --output-dir=${train_output_path}/test
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh b/examples/csmsc/voc1/local/train.sh
similarity index 60%
rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh
rename to examples/csmsc/voc1/local/train.sh
index df8cefd88eb0c331bc58516a760440255275353e..1ef860c36a527aa361dc3a081febae20557e34c7 100755
--- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/run.sh
+++ b/examples/csmsc/voc1/local/train.sh
@@ -1,10 +1,13 @@
 #!/bin/bash

+config_path=$1
+train_output_path=$2
+
 FLAGS_cudnn_exhaustive_search=true \
 FLAGS_conv_workspace_size_limit=4000 \
-python ../train.py \
+python ${BIN_DIR}/train.py \
     --train-metadata=dump/train/norm/metadata.jsonl \
     --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
     --nprocs=1
diff --git a/examples/csmsc/voc1/path.sh b/examples/csmsc/voc1/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..28d39ae0078daefa770c0b9c27c4bafe6e40f254
--- /dev/null
+++ b/examples/csmsc/voc1/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=parallelwave_gan
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/csmsc/voc1/run.sh b/examples/csmsc/voc1/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..16309543948c1a4de048e977639ddde86c4769b2
--- /dev/null
+++ b/examples/csmsc/voc1/run.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_5000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
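+# e.g. to rerun only the synthesis stage with an existing checkpoint:
+# ./run.sh --stage 2 --stop-stage 2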
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/ljspeech/README.md b/examples/ljspeech/README.md
index db87b149a36c567a0bffbf70ec26600ab91a5768..67b1bf473152de72d6627d26d017b8473a4c642c 100644
--- a/examples/ljspeech/README.md
+++ b/examples/ljspeech/README.md
@@ -2,5 +2,10 @@

 # LJSpeech
 * tts0 - Tacotron2
-* tts1 - TransformerTTS
-* voc0 - WaveFlow
+* tts1 - TransformerTTS
+* tts2 - SpeedySpeech
+* tts3 - FastSpeech2
+* voc0 - WaveFlow
+* voc1 - Parallel WaveGAN
+* voc2 - MelGAN
+* voc3 - MultiBand MelGAN
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e95f6614d75c90ad80ad49fd8c776a67211f68c2
--- /dev/null
+++ b/examples/ljspeech/tts0/README.md
@@ -0,0 +1,87 @@
+# Tacotron2 with LJSpeech
+PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
+
+## Dataset
+We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
+
+```bash
+wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+tar xjvf LJSpeech-1.1.tar.bz2
+```
+## Get Started
+Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize mels.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${conf_path}
+```
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
+                [--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
+                [--nprocs NPROCS] [--opts ...]
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config FILE         path of the config file to overwrite the default config
+                        with.
+  --data DATA_DIR       path to the dataset.
+  --output OUTPUT_DIR   path to save checkpoint and logs.
+  --checkpoint_path CHECKPOINT_PATH
+                        path of the checkpoint to load
+  --device {cpu,gpu}    device type to use, cpu and gpu are supported.
+  --nprocs NPROCS       number of parallel processes to use.
+  --opts ...            options to overwrite --config file and the default
+                        config, passing in KEY VALUE pairs
+```
+
+If you want to train on CPU, just set ``--device=cpu``.
+If you want to train on multiple GPUs, just set ``--nprocs`` to the number of GPUs.
+By default, training will be resumed from the latest checkpoint in ``--output``; if you want to start a new training run, please use a new ``${OUTPUTPATH}`` with no checkpoint.
+And if you want to resume from another existing model, you should set ``--checkpoint_path`` to the checkpoint path you want to load, as sketched below.
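+A minimal resume sketch (assumes `path.sh` has been sourced; the checkpoint name `step-20000` is only an illustration):
+```bash
+# resume from a specific checkpoint; the path is given without its file extension
+python3 ${BIN_DIR}/train.py \
+    --data=preprocessed_ljspeech \
+    --output=output \
+    --checkpoint_path=output/checkpoints/step-20000 \
+    --device=gpu
+```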
+**Note: The checkpoint path cannot contain the file extension.**
+
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesizes **mels** from a text list here.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
+                     [--input INPUT] [--output OUTPUT] [--device DEVICE]
+                     [--opts ...] [-v]
+
+generate mel spectrogram with TransformerTTS.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --config FILE         extra config to overwrite the default config
+  --checkpoint_path CHECKPOINT_PATH
+                        path of the checkpoint to load.
+  --input INPUT         path of the text sentences
+  --output OUTPUT       path to save outputs
+  --device DEVICE       device type to use.
+  --opts ...            options to overwrite --config file and the default
+                        config, passing in KEY VALUE pairs
+  -v, --verbose         print msg
+```
+**P.S.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example.)
+
+## Pretrained Models
+Pretrained models can be downloaded from the links below. We provide 2 models with different configurations.
+
+1. This model uses a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
+
+2. This model does not have a stop token predictor. It uses the attention peak position to decide whether all the content has been uttered. Also, guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
diff --git a/examples/ljspeech/tts0/local/preprocess.sh b/examples/ljspeech/tts0/local/preprocess.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c39a3172d044e46c11a7aee53ae51a18d5092fee
--- /dev/null
+++ b/examples/ljspeech/tts0/local/preprocess.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+preprocess_path=$1
+
+python3 ${BIN_DIR}/preprocess.py \
+    --input=~/datasets/LJSpeech-1.1 \
+    --output=${preprocess_path} \
+    -v
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/synthesize.sh b/examples/ljspeech/tts0/local/synthesize.sh
new file mode 100755
index 0000000000000000000000000000000000000000..91c89dd49cbba3e2095b07e9c678f1e463ed2e12
--- /dev/null
+++ b/examples/ljspeech/tts0/local/synthesize.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+train_output_path=$1
+ckpt_name=$2
+
+python3 ${BIN_DIR}/synthesize.py \
+    --config=${train_output_path}/config.yaml \
+    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
+    --input=${BIN_DIR}/../sentences_en.txt \
+    --output=${train_output_path}/test \
+    --device=gpu
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/tacotron2/README.md b/examples/ljspeech/tts0/local/tacotron2/README.md
deleted file mode 100644
index e5f159df9af3f362f618b2052356dc44f6a0da76..0000000000000000000000000000000000000000
--- a/examples/ljspeech/tts0/local/tacotron2/README.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Tacotron2
-
-PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text.
The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884). - -## Project Structure - -```text -├── config.py # default configuration file -├── ljspeech.py # dataset and dataloader settings for LJSpeech -├── preprocess.py # script to preprocess LJSpeech dataset -├── synthesize.py # script to synthesize spectrogram from text -├── train.py # script for tacotron2 model training -├── synthesize.ipynb # notebook example for end-to-end TTS -``` - -## Dataset - -We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). - -```bash -wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -tar xjvf LJSpeech-1.1.tar.bz2 -``` - -Then you need to preprocess the data by running ``preprocess.py``, the preprocessed data will be placed in ``--output`` directory. - -```bash -python preprocess.py \ ---input=${DATAPATH} \ ---output=${PREPROCESSEDDATAPATH} \ --v \ -``` - -For more help on arguments - -``python preprocess.py --help``. - -## Train the model - -Tacotron2 model can be trained by running ``train.py``. - -```bash -python train.py \ ---data=${PREPROCESSEDDATAPATH} \ ---output=${OUTPUTPATH} \ ---device=gpu \ -``` - -If you want to train on CPU, just set ``--device=cpu``. -If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU. -By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load. - -**Note: The checkpoint path cannot contain the file extension.** - -For more help on arguments - -``python train_transformer.py --help``. - -## Synthesize - -After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``. - -```bash -python synthesize.py \ ---config=${CONFIGPATH} \ ---checkpoint_path=${CHECKPOINTPATH} \ ---input=${TEXTPATH} \ ---output=${OUTPUTPATH} ---device=gpu -``` - -The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``. - -For more help on arguments - -``python synthesize.py --help``. - -Then you can find the spectrogram files in ``${OUTPUTPATH}``, and then they can be the input of vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files. - - -## Pretrained Models - -Pretrained Models can be downloaded from links below. We provide 2 models with different configurations. - -1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip) - -2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip) - - -## Notebook: End-to-end TTS - -See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow. 
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b8bcf5cb92a94e0bd6a62d03f00b232cc1ce3494
--- /dev/null
+++ b/examples/ljspeech/tts0/local/train.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device=gpu
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..590e7a9c97f7d5baf59d2c8ca9a2ffed6df820ed
--- /dev/null
+++ b/examples/ljspeech/tts0/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=tacotron2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
diff --git a/examples/ljspeech/tts0/run.sh b/examples/ljspeech/tts0/run.sh
old mode 100644
new mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..1da80c962516739b251785a8811337121e8f439f
--- a/examples/ljspeech/tts0/run.sh
+++ b/examples/ljspeech/tts0/run.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+preprocess_path=preprocessed_ljspeech
+train_output_path=output
+ckpt_name=step-35000
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${preprocess_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize mels from the text list
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1
+fi
+
diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md b/examples/ljspeech/tts1/README.md
similarity index 88%
rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md
rename to examples/ljspeech/tts1/README.md
index b6b2ac9a594ee06ff356bcf5a62d1604ba50fc6d..097dc08c1cf6cfb1ceaf09d38744c12f05dbd064 100644
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -8,12 +8,21 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
 ```bash
 tar xjvf LJSpeech-1.1.tar.bz2
 ```
-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
-Run the command below to preprocess the dataset.
-
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh.
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory.
The structure of the dump folder is listed below.
 ```text
@@ -35,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.
 ```text
@@ -71,17 +80,6 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.

-## Pretrained Model
-Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
-
-TransformerTTS checkpoint contains files listed below.
-```text
-transformer_tts_ljspeech_ckpt_0.4
-├── default.yaml # default config used to train transformer_tts
-├── phone_id_map.txt # phone vocabulary file when training transformer_tts
-├── snapshot_iter_201500.pdz # model parameters and optimizer states
-└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
-```
 ## Synthesize
 We use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder.
 Download the pretrained WaveFlow model with residual channel equal to 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
@@ -94,9 +92,9 @@ waveflow_ljspeech_ckpt_0.3
 ├── config.yaml # default config used to train waveflow
 └── step-2000000.pdparams # model parameters of waveflow
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
@@ -132,9 +130,9 @@ optional arguments:
   --device DEVICE       device type to use.
   --verbose VERBOSE     verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from a text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize_e2e.py [-h]
@@ -177,17 +175,30 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences.txt` using pretrained transformer_tts and waveflow models.
+## Pretrained Model
+Pretrained model can be downloaded here: [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
+
+TransformerTTS checkpoint contains files listed below.
+```text
+transformer_tts_ljspeech_ckpt_0.4
+├── default.yaml # default config used to train transformer_tts
+├── phone_id_map.txt # phone vocabulary file when training transformer_tts
+├── snapshot_iter_201500.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts and waveflow models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e.py \
     --transformer-tts-config=transformer_tts_ljspeech_ckpt_0.4/default.yaml \
     --transformer-tts-checkpoint=transformer_tts_ljspeech_ckpt_0.4/snapshot_iter_201500.pdz \
     --transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
     --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
     --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
-    --text=../sentences.txt \
+    --text=${BIN_DIR}/../sentences_en.txt \
     --output-dir=exp/default/test_e2e \
     --device="gpu" \
     --phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml
similarity index 100%
rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/conf/default.yaml
rename to examples/ljspeech/tts1/conf/default.yaml
diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh b/examples/ljspeech/tts1/local/preprocess.sh
similarity index 89%
rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh
rename to examples/ljspeech/tts1/local/preprocess.sh
index 7fc5247bd4ea79bf3d7264e4a40d05c64e4ebb4f..e1acc8e83da392d679ca4032b7395cbf6485c1e4 100755
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/preprocess.sh
+++ b/examples/ljspeech/tts1/local/preprocess.sh
@@ -3,12 +3,12 @@
 stage=1
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # extract features
     echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
         --dataset=ljspeech \
         --rootdir=~/datasets/LJSpeech-1.1/ \
         --dumpdir=dump \
@@ -27,21 +27,21 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # normalize and convert phone to id, dev and test should use train's stats
     echo "Normalize ..."
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh b/examples/ljspeech/tts1/local/synthesize.sh similarity index 61% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh rename to examples/ljspeech/tts1/local/synthesize.sh index 164e5ba236b7ec879f2f48a7a3c67ac5e3864f81..5d1c9534a822a8c3754ba00167adb7b055058cba 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize.sh +++ b/examples/ljspeech/tts1/local/synthesize.sh @@ -1,13 +1,18 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --transformer-tts-config=conf/default.yaml \ - --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --transformer-tts-config=${config_path} \ + --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --transformer-tts-stat=dump/train/speech_stats.npy \ --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh similarity index 53% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh rename to examples/ljspeech/tts1/local/synthesize_e2e.sh index 4fb6923847d352ea6000f91b145a5ca2a5877c99..333a5cd6b2dff69f496a2450fafdd75b3daf839e 100755 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.sh +++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh @@ -1,13 +1,18 @@ #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --transformer-tts-config=conf/default.yaml \ - --transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \ +python3 ${BIN_DIR}/synthesize_e2e.py \ + --transformer-tts-config=${config_path} \ + --transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --transformer-tts-stat=dump/train/speech_stats.npy \ --waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \ --waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh 
b/examples/ljspeech/tts1/local/train.sh
similarity index 55%
rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh
rename to examples/ljspeech/tts1/local/train.sh
index f448bdfc1219ceb5eb7ec937b16ff13043e5fa0e..8527f57f3d7e9a44a288bc1997f476bdd46e3e6a 100755
--- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/run.sh
+++ b/examples/ljspeech/tts1/local/train.sh
@@ -1,9 +1,12 @@
 #!/bin/bash

-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
     --train-metadata=dump/train/norm/metadata.jsonl \
     --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
     --nprocs=2 \
     --phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts1/local/transformer_tts/sentences.txt b/examples/ljspeech/tts1/local/transformer_tts/sentences.txt
deleted file mode 100644
index 36b73a528461513938d6ad7dfec7987259799ccd..0000000000000000000000000000000000000000
--- a/examples/ljspeech/tts1/local/transformer_tts/sentences.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-001 Life was like a box of chocolates, you never know what you're gonna get.
-002 With great power there must come great responsibility.
-003 To be or not to be, that’s a question.
-004 A man can be destroyed but not defeated
-005 Do not, for one repulse, give up the purpose that you resolved to effort.
-006 Death is just a part of life, something we're all destined to do.
-007 I think it's hard winning a war with words.
-008 Don’t argue with the people of strong determination, because they may change the fact!
-009 Love you three thousand times.
\ No newline at end of file
diff --git a/examples/ljspeech/tts1/path.sh b/examples/ljspeech/tts1/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..201261b168f2932ffd64c842aa5140d554715195
--- /dev/null
+++ b/examples/ljspeech/tts1/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=transformer_tts
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
diff --git a/examples/ljspeech/tts1/run.sh b/examples/ljspeech/tts1/run.sh
old mode 100644
new mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6e7a60607190c438d023b20d794cbf7317acdab7
--- a/examples/ljspeech/tts1/run.sh
+++ b/examples/ljspeech/tts1/run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_403.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is waveflow
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is waveflow
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/vctk/fastspeech2/ljspeech/README.md b/examples/ljspeech/tts3/README.md
similarity index 85%
rename from examples/vctk/fastspeech2/ljspeech/README.md
rename to examples/ljspeech/tts3/README.md
index ed905bea3dbede68bdf731c32fd39a9912c8b862..f5bea6a90645bf4d7c780bc80ccd4572f2b2a014 100644
--- a/examples/vctk/fastspeech2/ljspeech/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -9,13 +9,22 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. You can download it from here: [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
-### Preprocess the dataset
+## Get Started
 Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
 Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
-Run the command below to preprocess the dataset.
-
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+```bash
+./run.sh
+```
+### Preprocess the dataset
 ```bash
-./preprocess.sh
+./local/preprocess.sh ${conf_path}
 ```
 When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
@@ -40,10 +49,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
 Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.

-## Train the model
-`./run.sh` calls `../train.py`.
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
 ```bash
-./run.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
 ```
 Here's the complete help message.
 ```text
@@ -78,18 +87,7 @@ optional arguments:
 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
 6. `--phones-dict` is the path of the phone vocabulary file.

-## Pretrained Model
-Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
-
-FastSpeech2 checkpoint contains files listed below.
-```text
-fastspeech2_nosil_ljspeech_ckpt_0.5
-├── default.yaml # default config used to train fastspeech2
-├── phone_id_map.txt # phone vocabulary file when training fastspeech2
-├── snapshot_iter_100000.pdz # model parameters and optimizer states
-└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
-```
-## Synthesize
+### Synthesize
 We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/ljspeech/) as the neural vocoder.
 Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
 ```bash
@@ -102,9 +100,9 @@ pwg_ljspeech_ckpt_0.5
 ├── pwg_default.yaml # default config used to train parallel wavegan
 ├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
 └── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
 ```
-`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
 ```bash
-./synthesize.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
 usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@@ -144,19 +142,19 @@ optional arguments:
   --device DEVICE       device type to use.
   --verbose VERBOSE     verbose.
 ```
-`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from a text file.
 ```bash
-./synthesize_e2e.sh
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
 ```
 ```text
-usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
-                         [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
-                         [--fastspeech2-stat FASTSPEECH2_STAT]
-                         [--pwg-config PWG_CONFIG]
-                         [--pwg-checkpoint PWG_CHECKPOINT]
-                         [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
-                         [--text TEXT] [--output-dir OUTPUT_DIR]
-                         [--device DEVICE] [--verbose VERBOSE]
+usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
+                            [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
+                            [--fastspeech2-stat FASTSPEECH2_STAT]
+                            [--pwg-config PWG_CONFIG]
+                            [--pwg-checkpoint PWG_CHECKPOINT]
+                            [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
+                            [--text TEXT] [--output-dir OUTPUT_DIR]
+                            [--device DEVICE] [--verbose VERBOSE]

 Synthesize with fastspeech2 & parallel wavegan.

@@ -191,18 +189,31 @@ optional arguments:
 5. `--output-dir` is the directory to save synthesized audio files.
 6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.

-You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
+## Pretrained Model
+Pretrained FastSpeech2 model trained with no silence at the edges of audio: [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+```text
+fastspeech2_nosil_ljspeech_ckpt_0.5
+├── default.yaml # default config used to train fastspeech2
+├── phone_id_map.txt # phone vocabulary file when training fastspeech2
+├── snapshot_iter_100000.pdz # model parameters and optimizer states
+└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
+```
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
 ```bash
+source path.sh
+
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 synthesize_e2e.py \
+python3 ${BIN_DIR}/synthesize_e2e_en.py \
     --fastspeech2-config=fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
     --fastspeech2-checkpoint=fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
     --fastspeech2-stat=fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
     --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
     --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
     --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
-    --text=../sentences_en.txt \
+    --text=${BIN_DIR}/../sentences_en.txt \
     --output-dir=exp/default/test_e2e \
     --device="gpu" \
     --phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
diff --git a/examples/vctk/fastspeech2/ljspeech/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml
similarity index 100%
rename from examples/vctk/fastspeech2/ljspeech/conf/default.yaml
rename to examples/ljspeech/tts3/conf/default.yaml
diff --git a/examples/vctk/fastspeech2/ljspeech/preprocess.sh b/examples/ljspeech/tts3/local/preprocess.sh
similarity index 90%
rename from examples/vctk/fastspeech2/ljspeech/preprocess.sh
rename to examples/ljspeech/tts3/local/preprocess.sh
index ff2e765d7bd5c0b52397e35f03a95e4401da681b..749a9884c67134e712463752e81d9e7b7c4e5a4f 100755
--- a/examples/vctk/fastspeech2/ljspeech/preprocess.sh
+++ b/examples/ljspeech/tts3/local/preprocess.sh
@@ -3,7 +3,7 @@
 stage=0
 stop_stage=100

-export MAIN_ROOT=`realpath ${PWD}/../../../`
+config_path=$1

 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # get durations from MFA's result
@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
         --inputdir=./ljspeech_alignment \
         --output=durations.txt \
-        --config=conf/default.yaml
+        --config=${config_path}
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # extract features
     echo "Extract features ..."
-    python3 ../preprocess.py \
+    python3 ${BIN_DIR}/preprocess.py \
         --dataset=ljspeech \
         --rootdir=~/datasets/LJSpeech-1.1/ \
         --dumpdir=dump \
         --dur-file=durations.txt \
-        --config=conf/default.yaml \
+        --config=${config_path} \
         --num-cpu=8 \
         --cut-sil=True
 fi
@@ -46,7 +46,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # normalize and convert phone/speaker to id, dev and test should use train's stats
     echo "Normalize ..."
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh similarity index 64% rename from examples/vctk/fastspeech2/ljspeech/synthesize.sh rename to examples/ljspeech/tts3/local/synthesize.sh index 0f8225e4448abe4538242a36d567f2be4357a6e5..32dcde58628f92e746ca3789bcb4834e7f35a36d 100755 --- a/examples/vctk/fastspeech2/ljspeech/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -1,15 +1,19 @@ - #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ + --output-dir=${train_output_path}/test \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh similarity index 56% rename from examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh rename to examples/ljspeech/tts3/local/synthesize_e2e.sh index 158d4483ca7062abd64f057481ec0bef0b693921..28ea3a8fac3bf6f328bbe2980a455afc28d2b675 100755 --- a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -1,15 +1,19 @@ - #!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \ +python3 ${BIN_DIR}/synthesize_e2e_en.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ --fastspeech2-stat=dump/train/speech_stats.npy \ --pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ --pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ --pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ - --text=../sentences_en.txt \ - --output-dir=exp/default/test_e2e \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ --device="gpu" \ --phones-dict=dump/phone_id_map.txt diff --git 
a/examples/vctk/fastspeech2/ljspeech/run.sh b/examples/ljspeech/tts3/local/train.sh
similarity index 55%
rename from examples/vctk/fastspeech2/ljspeech/run.sh
rename to examples/ljspeech/tts3/local/train.sh
index fd5e2c689e0dd32d6d383c5fd528030f651b5158..847a44e3c0e4407876d5c856a0253642bda541e7 100755
--- a/examples/vctk/fastspeech2/ljspeech/run.sh
+++ b/examples/ljspeech/tts3/local/train.sh
@@ -1,9 +1,12 @@
 #!/bin/bash

-python3 ../train.py \
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
     --train-metadata=dump/train/norm/metadata.jsonl \
     --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=conf/default.yaml \
-    --output-dir=exp/default \
+    --config=${config_path} \
+    --output-dir=${train_output_path} \
     --nprocs=1 \
     --phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..561d01632ba4b63b582ef07e460d31842823f07c
--- /dev/null
+++ b/examples/ljspeech/tts3/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=fastspeech2
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..143debd2a4c05fe825f739faf9a8496c8a686c9b
--- /dev/null
+++ b/examples/ljspeech/tts3/run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_201.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6163ae42f7a5e03f5fa60e04a9f754f8a1a5c932
--- /dev/null
+++ b/examples/ljspeech/voc0/README.md
@@ -0,0 +1,52 @@
+# WaveFlow with LJSpeech
+## Dataset
+### Download the dataset
+```bash
+wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
+```
+### Extract the dataset
+```bash
+tar xjvf LJSpeech-1.1.tar.bz2
+```
+## Get Started
+Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
+Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
+Run the command below to
+1. **source path**.
+2.
preprocess the dataset.
+3. train the model.
+4. synthesize wavs from mels.
+```bash
+./run.sh
+```
+### Preprocess the dataset
+```bash
+./local/preprocess.sh ${preprocess_path}
+```
+### Train the model
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
+```
+The training script requires 4 command line arguments.
+1. `--data` is the path of the training dataset.
+2. `--output` is the path of the output directory.
+3. `--device` should be "cpu" or "gpu".
+4. `--nprocs` is the number of processes to train the model in parallel.
+
+If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with CPU is not supported yet.
+
+### Synthesize
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
+```
+
+Synthesize waveform.
+1. We assume the `--input` is a directory containing several mel spectrograms (log magnitude) in `.npy` format.
+2. The output will be saved in the `--output` directory, containing several `.wav` files, each with the same name as its mel spectrogram.
+3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extension name `.pdparams` is not included here.
+4. `--device` specifies the device to run synthesis on.
+
+## Pretrained Model
+Pretrained model with residual channel equal to 128 can be downloaded here: [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
diff --git a/examples/ljspeech/voc0/local/preprocess.sh b/examples/ljspeech/voc0/local/preprocess.sh
new file mode 100755
index 0000000000000000000000000000000000000000..4a45793e676d3457549b00fa0cf6ac9a1577d1a7
--- /dev/null
+++ b/examples/ljspeech/voc0/local/preprocess.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+preprocess_path=$1
+
+python3 ${BIN_DIR}/preprocess.py \
+    --input=~/datasets/LJSpeech-1.1 \
+    --output=${preprocess_path}
\ No newline at end of file
diff --git a/examples/ljspeech/voc0/local/synthesize.sh b/examples/ljspeech/voc0/local/synthesize.sh
new file mode 100755
index 0000000000000000000000000000000000000000..055542cf987734d9e3ec45d04f7e2d2fa6dd1085
--- /dev/null
+++ b/examples/ljspeech/voc0/local/synthesize.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+input_mel_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+python ${BIN_DIR}/synthesize.py \
+    --input=${input_mel_path} \
+    --output=${train_output_path}/wavs/ \
+    --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
+    --device="gpu" \
+    --verbose
\ No newline at end of file
diff --git a/examples/ljspeech/voc0/local/train.sh b/examples/ljspeech/voc0/local/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5c4defd9b318ebe5454686cfa55853454935cbd9
--- /dev/null
+++ b/examples/ljspeech/voc0/local/train.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+preprocess_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+    --data=${preprocess_path} \
+    --output=${train_output_path} \
+    --device="gpu" \
+    --nprocs=1
\ No newline at end of file
diff --git a/examples/ljspeech/voc0/local/waveflow/README.md b/examples/ljspeech/voc0/local/waveflow/README.md
deleted file mode 100644
index b3be1e4a3b4f032dedb60bb197e86dc4d2c6ea2c..0000000000000000000000000000000000000000
--- a/examples/ljspeech/voc0/local/waveflow/README.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# WaveFlow
with LJSpeech
-
-## Dataset
-
-### Download the datasaet.
-
-```bash
-wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
-```
-
-### Extract the dataset.
-
-```bash
-tar xjvf LJSpeech-1.1.tar.bz2
-```
-
-### Preprocess the dataset.
-
-Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
-
-```bash
-python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_waveflow
-```
-
-## Train the model
-
-The training script requires 4 command line arguments.
-`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.)
-
-`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
-
-```bash
-python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1
-```
-
-If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
-
-## Synthesize
-
-Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
-
-`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here.
-
-`--device` specifies to device to run synthesis on.
-
-```bash
-python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose
-```
-
-## Pretrained Model
-
-Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
diff --git a/examples/ljspeech/voc0/path.sh b/examples/ljspeech/voc0/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b9fe83ecc96368cbc948f1265cb204b68b276c31
--- /dev/null
+++ b/examples/ljspeech/voc0/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=waveflow
+export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/ljspeech/voc0/run.sh b/examples/ljspeech/voc0/run.sh
old mode 100644
new mode 100755
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a4f1ac389cf244cc68609e809ff6713b22f0f06b
--- a/examples/ljspeech/voc0/run.sh
+++ b/examples/ljspeech/voc0/run.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+preprocess_path=preprocessed_ljspeech
+train_output_path=output
+# mel generated by Tacotron2
+input_mel_path=../tts0/output/test
+ckpt_name=step-10000
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
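+# assuming Kaldi-style option parsing in parse_options.sh, variables defined
+# above (e.g. `gpus`) can also be overridden, e.g. to run only training on GPU 0:
+# ./run.sh --gpus 0 --stage 1 --stop-stage 1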
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${preprocess_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md b/examples/ljspeech/voc1/README.md similarity index 87% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md rename to examples/ljspeech/voc1/README.md index 5b54ef5ad8401b4380a473a6cb5405bd3b8aa7be..995b4c7c6f7426d9b2581a23c9897a9b37e4db7e 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/README.md +++ b/examples/ljspeech/voc1/README.md @@ -1,22 +1,28 @@ -# Parallel WaveGAN with the LJSpeech-1.1 dataset - +# Parallel WaveGAN with LJSpeech-1.1 This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). - -## Preprocess the dataset - +## Dataset ### Download and Extract the datasaet Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). - ### Get MFA results for silence trim We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio. You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. -Run the command below to preprocess the dataset. +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` + +### Preprocess the dataset ```bash -./preprocess.sh +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -38,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` Here's the complete help message. @@ -88,23 +94,10 @@ benchmark: 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. -## Pretrained Models -Pretrained models can be downloaded here: -1. Parallel WaveGAN checkpoint. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip), which is used as a vocoder in the end-to-end inference script.
- -Parallel WaveGAN checkpoint contains files listed below. - -```text -pwg_ljspeech_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan -``` - -## Synthesize -`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`. +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] @@ -127,10 +120,21 @@ optional arguments: ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. -2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +## Pretrained Models +Pretrained models can be downloaded here: [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwg_ljspeech_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/conf/default.yaml rename to examples/ljspeech/voc1/conf/default.yaml diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh similarity index 84% rename from examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh rename to examples/ljspeech/voc1/local/preprocess.sh index d88d2989c8860e1bf4e06adea59375f896816f08..d1af60dad6a6aa4ed3a89a60bee1fa266175d450 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./ljspeech_alignment \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..."
- python3 ../../preprocess.py \ + python3 ${BIN_DIR}/../preprocess.py \ --rootdir=~/datasets/LJSpeech-1.1/ \ --dataset=ljspeech \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --cut-sil=True \ --num-cpu=20 fi @@ -39,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize, dev and test should use train's stats echo "Normalize ..." - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 100755 index 0000000000000000000000000000000000000000..9f904ac0c6e7006ab40c3d8aaa7c457ad1495b36 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh b/examples/ljspeech/voc1/local/train.sh similarity index 60% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh rename to examples/ljspeech/voc1/local/train.sh index df8cefd88eb0c331bc58516a760440255275353e..1ef860c36a527aa361dc3a081febae20557e34c7 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/run.sh +++ b/examples/ljspeech/voc1/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 + FLAGS_cudnn_exhaustive_search=true \ FLAGS_conv_workspace_size_limit=4000 \ -python ../train.py \ +python ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..28d39ae0078daefa770c0b9c27c4bafe6e40f254 --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/run.sh b/examples/ljspeech/voc1/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..16309543948c1a4de048e977639ddde86c4769b2 --- /dev/null +++ b/examples/ljspeech/voc1/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default 
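+# the training stage saves checkpoints under ${train_output_path}/checkpoints/;
+# the ckpt_name below is only an example, pick a snapshot that actually exists there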
+ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this cannot be used together with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/voxceleb/spk0/local/ge2e/README.md b/examples/other/ge2e/README.md similarity index 55% rename from examples/voxceleb/spk0/local/ge2e/README.md rename to examples/other/ge2e/README.md index b05786a16f77b23001031957a79471537e46ab32..89365d6351a4f58d502dd63211462338ad9a5cbc 100644 --- a/examples/voxceleb/spk0/local/ge2e/README.md +++ b/examples/other/ge2e/README.md @@ -1,97 +1,78 @@ # Speaker Encoder - This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_shell3). The trained speaker encoder is used to extract utterance embeddings from utterances. - ## Model - The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used. -## File Structure - -```text -ge2e -├── README.md -├── README_cn.md -├── audio_processor.py -├── config.py -├── dataset_processors.py -├── inference.py -├── preprocess.py -├── random_cycle.py -├── speaker_verification_dataset.py -└── train.py -``` - ## Download Datasets - Currently supported datasets are Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata, which can be downloaded from corresponding webpage. 1. Librispeech/train-other-500 - An English multispeaker dataset,[URL](https://www.openslr.org/resources/12/train-other-500.tar.gz),only the `train-other-500` subset is used. - 2. VoxCeleb1 - An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev D should be downloaded, combined and extracted. - 3. VoxCeleb2 - An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev H should be downloaded, combined and extracted. - 4. Aidatatang-200zh - A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/62/) . - 5. magicdata - A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/68/) . If you want to use other datasets, you can also download and preprocess it as long as it meets the requirements described below. -## Preprocess Datasets +## Get Started +```bash +./run.sh +``` + +### Preprocess Datasets +`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`.
+```bash +./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} +``` +Assume `datasets_root` is `~/datasets/GE2E`, and it has the following structure (we only use `train-other-500` for simplicity): +```text +GE2E +├── LibriSpeech +└── (other datasets) +``` Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preporcessed datasets are organized in a file structure described below. The mel spectrogram of each utterance is save in `.npy` format. The dataset is 2-stratified (speaker-utterance). Since multiple datasets are combined, to avoid conflict in speaker id, dataset name is prepended to the speake ids. ```text dataset_root ├── dataset01_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy ├── dataset01_speaker02/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy ├── dataset02_speaker01/ -│   ├── utterance01.npy -│   ├── utterance02.npy -│   └── utterance03.npy +│ ├── utterance01.npy +│ ├── utterance02.npy +│ └── utterance03.npy └── dataset02_speaker02/ -    ├── utterance01.npy -    ├── utterance02.npy -    └── utterance03.npy + ├── utterance01.npy + ├── utterance02.npy + └── utterance03.npy ``` +In `${BIN_DIR}/preprocess.py`: +1. `--datasets_root` is the directory that contains several extracted datasets. +2. `--output_dir` is the directory to save the preprocessed dataset. +3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with commas. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata. -Run the command to preprocess datasets. - +### Train the model +`./local/train.sh` calls `${BIN_DIR}/train.py`. ```bash -python preprocess.py --datasets_root= --output_dir= --dataset_names= +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} ``` - -Here `--datasets_root` is the directory that contains several extracted dataset; `--output_dir` is the directory to save the preprocessed dataset; `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata. - -## Training - -When preprocessing is done, run the command below to train the mdoel. - -```bash -python train.py --data= --output= --device="gpu" --nprocs=1 -``` - -- `--data` is the path to the preprocessed dataset. -- `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. -- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. -- `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
+In `${BIN_DIR}/train.py`: +1. `--data` is the path to the preprocessed dataset. +2. `--output` is the directory to save results, usually a subdirectory of `runs`. It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training. +3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported. +4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training. Currently multiprocessing based parallel training is only enabled when using 'gpu' as the device. +5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda. Other options are described below. @@ -99,29 +80,23 @@ Other options are described below. - `--opts` is command line options to further override config files. It should be the last comman line options passed with multiple key-value pairs separated by spaces. - `--checkpoint_path` specifies the checkpoiont to load before training, extension is not included. A parameter file ( `.pdparams`) and an optimizer state file ( `.pdopt`) with the same name is used. This option has a higher priority than auto-resuming from the `--output` directory. -## Pretrained Model - -The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps. - -Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip). - -## Inference - +### Inference When training is done, run the command below to generate utterance embedding for each utterance in a dataset. - +`./local/inference.sh` calls `${BIN_DIR}/inference.py`. ```bash -python inference.py --input= --output= --checkpoint_path= --device="gpu" +CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} ``` +In `${BIN_DIR}/inference.py`: +1. `--input` is the path of the dataset used for inference. +2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corresponding utterance embedding file in `*.npy` format. +3. `--checkpoint_path` is the path of the checkpoint to use, extension not included. +4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. +5. `--device` and `--opts` have the same meaning as in the training script. -`--input` is the path of the dataset used for inference. - -`--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format. - -`--checkpoint_path` is the path of the checkpoint to use, extension not included. - -`--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`. +## Pretrained Model +The pretrained model is first trained to 1560k steps on Librispeech-other-500 and voxceleb1, then trained on aidatatang_200zh and magicdata to 3000k steps. -`--device` and `--opts` have the same meaning as in the training script. +Download URL: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
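+
+For a quick sanity check of the embeddings produced by inference, you can load one with numpy (a minimal sketch; the exact `.npy` path below is hypothetical):
+```python
+import numpy as np
+
+# each utterance yields one fixed-size utterance embedding stored as *.npy
+emb = np.load("infer_output/dataset01_speaker01/utterance01.npy")
+print(emb.shape)
+```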
## References diff --git a/examples/other/ge2e/local/inference.sh b/examples/other/ge2e/local/inference.sh new file mode 100755 index 0000000000000000000000000000000000000000..1beebdfaa144ff62bcf322a6dc22361a5f51d349 --- /dev/null +++ b/examples/other/ge2e/local/inference.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# generate utterance embedding for each utterance in a dataset. +infer_input=$1 +infer_output=$2 +train_output_path=$3 +ckpt_name=$4 + +python3 ${BIN_DIR}/inference.py \ + --input=${infer_input} \ + --output=${infer_output} \ + --checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \ + --device="gpu" + diff --git a/examples/other/ge2e/local/preprocess.sh b/examples/other/ge2e/local/preprocess.sh new file mode 100755 index 0000000000000000000000000000000000000000..9851596b58074944710d66d51dbc1060325b7301 --- /dev/null +++ b/examples/other/ge2e/local/preprocess.sh @@ -0,0 +1,9 @@ +#!/bin/bash +datasets_root=$1 +preprocess_path=$2 +dataset_names=$3 + +python3 ${BIN_DIR}/preprocess.py \ + --datasets_root=${datasets_root} \ + --output_dir=${preprocess_path} \ + --dataset_names=${dataset_names} \ No newline at end of file diff --git a/examples/other/ge2e/local/train.sh b/examples/other/ge2e/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..5c4defd9b318ebe5454686cfa55853454935cbd9 --- /dev/null +++ b/examples/other/ge2e/local/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +preprocess_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --data=${preprocess_path} \ + --output=${train_output_path} \ + --device="gpu" \ + --nprocs=1 \ No newline at end of file diff --git a/examples/other/ge2e/path.sh b/examples/other/ge2e/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..4333199cb917b31f49886c3fd81324249383661d --- /dev/null +++ b/examples/other/ge2e/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=ge2e +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/other/ge2e/run.sh b/examples/other/ge2e/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..d7954bd2fbfb69e8ad6196c61b43b34ac3c49e1a --- /dev/null +++ b/examples/other/ge2e/run.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +datasets_root=~/datasets/GE2E +preprocess_path=dump +dataset_names=librispeech_other +train_output_path=output +infer_input=infer_input +infer_output=infer_output +ckpt_name=step-10000 + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this cannot be used together with `$1`, `$2` ...
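+# note: dataset_names above may also be a comma-joined list, e.g.
+#   dataset_names=librispeech_other,voxceleb1
+# (both are among the supported dataset names listed in the README)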
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/other/punctuation_restoration/README.md b/examples/other/punctuation_restoration/README.md index 6393d8f5bab71f3385c2384331671226e88a837b..3454823d34c7760f86337c2107ff15c0a42bad98 100644 --- a/examples/other/punctuation_restoration/README.md +++ b/examples/other/punctuation_restoration/README.md @@ -1,4 +1,3 @@ # Punctation Restoration Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task. - diff --git a/examples/other/text_frontend/get_g2p_data.py b/examples/other/text_frontend/get_g2p_data.py index 78535b66a98c91f985dd4b4e2d293e14f2d7116e..61ef3d0988d8683578375928f6b29ac0e45313d3 100644 --- a/examples/other/text_frontend/get_g2p_data.py +++ b/examples/other/text_frontend/get_g2p_data.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse from collections import defaultdict from pathlib import Path diff --git a/examples/other/text_frontend/get_textnorm_data.py b/examples/other/text_frontend/get_textnorm_data.py index 8058e05841da222c8d70072ce93b11ca06fb05b3..3928e67c5c8f2ad09fd44d210b87c7b3da1fd6bb 100644 --- a/examples/other/text_frontend/get_textnorm_data.py +++ b/examples/other/text_frontend/get_textnorm_data.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse from pathlib import Path diff --git a/examples/other/text_frontend/test_g2p.py b/examples/other/text_frontend/test_g2p.py index 0515e9940efeb5030609e185decf650cd0579742..15005a003729bb6329d26f74028fc03fd8df4427 100644 --- a/examples/other/text_frontend/test_g2p.py +++ b/examples/other/text_frontend/test_g2p.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import re from pathlib import Path diff --git a/examples/other/text_frontend/test_textnorm.py b/examples/other/text_frontend/test_textnorm.py index 99eed290a6d38d7deb79e053e9e6e0ecf6d40fea..22f90f8748d9136e4d5fb6788bd7c72144cc971c 100644 --- a/examples/other/text_frontend/test_textnorm.py +++ b/examples/other/text_frontend/test_textnorm.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import re from pathlib import Path diff --git a/examples/other/use_mfa/local/detect_oov.py b/examples/other/use_mfa/local/detect_oov.py index f5ae728f761cb7a6b670b94c86f5e64be64c0491..4928e45346820f578c65976c7ec0e232fc41c990 100644 --- a/examples/other/use_mfa/local/detect_oov.py +++ b/examples/other/use_mfa/local/detect_oov.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse +import logging from collections import OrderedDict from pathlib import Path -import logging def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"): diff --git a/examples/other/use_mfa/local/generate_lexicon.py b/examples/other/use_mfa/local/generate_lexicon.py index b6e594ab8ccde1acd3df503ac328bd1b36f5a819..e9445665bad45258b55978c23e912e42cdc7b575 100644 --- a/examples/other/use_mfa/local/generate_lexicon.py +++ b/examples/other/use_mfa/local/generate_lexicon.py @@ -20,9 +20,8 @@ than words are used in transcriptions produced by `reorganize_baker.py`. We make this choice to better leverage other software for chinese text to pinyin tools like pypinyin. This is the convention for G2P in Chinese. """ - -import re import argparse +import re from collections import OrderedDict INITIALS = [ diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh deleted file mode 100755 index e95b0da8fd9c6cf6ec8ef16090a5d9be04f15fc2..0000000000000000000000000000000000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh deleted file mode 100755 index e95b0da8fd9c6cf6ec8ef16090a5d9be04f15fc2..0000000000000000000000000000000000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/ljspeech/synthesize.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh b/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh deleted file mode 100755 index 42213058fa9fd569dcb8e5d51679f463cf569fed..0000000000000000000000000000000000000000 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/synthesize.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -python3 ../synthesize.py \ - --config=conf/default.yaml \ - --checkpoint=exp/default/checkpoints/snapshot_iter_35000.pdz_bak\ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test diff --git a/examples/vctk/README.md b/examples/vctk/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4007c0319f4fd92fbb1ffaf4d3ff37127376d1a2 --- /dev/null +++ b/examples/vctk/README.md @@ -0,0 +1,11 @@ + +# VCTK + +* tts0 - Tacotron2 +* tts1 -
TransformerTTS +* tts2 - SpeedySpeech +* tts3 - FastSpeech2 +* voc0 - WaveFlow +* voc1 - Parallel WaveGAN +* voc2 - MelGAN +* voc3 - MultiBand MelGAN diff --git a/examples/vctk/fastspeech2/aishell3/synthesize.sh b/examples/vctk/fastspeech2/aishell3/synthesize.sh deleted file mode 100755 index 950b2077f2f7fb2d240603f8fea8b6321eb35760..0000000000000000000000000000000000000000 --- a/examples/vctk/fastspeech2/aishell3/synthesize.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ../synthesize.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \ - --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=exp/default/test \ - --device="gpu" \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh b/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh deleted file mode 100755 index 315337143fc3bf357f785fbc8afbd78b09fe1a3f..0000000000000000000000000000000000000000 --- a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 synthesize_e2e.py \ - --fastspeech2-config=conf/default.yaml \ - --fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \ - --fastspeech2-stat=dump/train/speech_stats.npy \ - --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --text=../sentences.txt \ - --output-dir=exp/default/test_e2e \ - --device="gpu" \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/baker/run.sh b/examples/vctk/fastspeech2/baker/run.sh deleted file mode 100755 index 3e9a5e22222d7b61514564af227e17be52944fcc..0000000000000000000000000000000000000000 --- a/examples/vctk/fastspeech2/baker/run.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -python3 ../train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ - --nprocs=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/fastspeech2/sentences.txt b/examples/vctk/fastspeech2/sentences.txt deleted file mode 100644 index 3aa5376b4e08bae4e5f75ead2e419679dd2d6ffe..0000000000000000000000000000000000000000 --- a/examples/vctk/fastspeech2/sentences.txt +++ /dev/null @@ -1,16 +0,0 @@ -001 凯莫瑞安联合体的经济崩溃,迫在眉睫。 -002 对于所有想要离开那片废土,去寻找更美好生活的人来说。 -003 克哈,是你们所有人安全的港湾。 -004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。 -005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。 -006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。 -007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。 -008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。 -009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。 -010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。 -011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。 -012 法治是我们的命脉,然而它却受到前所未有的挑战。 -013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。 -014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。 -015 永远记住,谁才是最能保护你们的人。 -016 不要听信别人的谗言,我不是什么克隆人。 \ No newline at end of file diff --git a/examples/vctk/fastspeech2/vctk/README.md 
b/examples/vctk/tts3/README.md similarity index 65% rename from examples/vctk/fastspeech2/vctk/README.md rename to examples/vctk/tts3/README.md index 8dc939d85e1d5516baf96079fb990c72cc742500..2a79cdd6c7c4597e97c967ecc5f0b390aef93754 100644 --- a/examples/vctk/fastspeech2/vctk/README.md +++ b/examples/vctk/tts3/README.md @@ -12,13 +12,22 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. -Run the command below to preprocess the dataset. - +Run the command below to +1. **source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from text file. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -43,11 +52,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance. -## Train the model -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] @@ -81,14 +90,23 @@ optional arguments: 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. 6. `--phones-dict` is the path of the phone vocabulary file. -## Pretrained Model - -## Synthesize +### Synthesize +We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder. +Download the pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip) and unzip it. +```bash +unzip pwg_vctk_ckpt_0.5.zip +``` +Parallel WaveGAN checkpoint contains files listed below. +```text +pwg_vctk_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` -`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] @@ -128,19 +146,22 @@ optional arguments: --device DEVICE device type to use. --verbose VERBOSE verbose. ``` -`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file. ```bash -./synthesize_e2e.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text -usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG] - [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] - [--fastspeech2-stat FASTSPEECH2_STAT] - [--pwg-config PWG_CONFIG] - [--pwg-checkpoint PWG_CHECKPOINT] - [--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT] - [--text TEXT] [--output-dir OUTPUT_DIR] - [--device DEVICE] [--verbose VERBOSE] +usage: multi_spk_synthesize_e2e_en.py [-h] + [--fastspeech2-config FASTSPEECH2_CONFIG] + [--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT] + [--fastspeech2-stat FASTSPEECH2_STAT] + [--pwg-config PWG_CONFIG] + [--pwg-checkpoint PWG_CHECKPOINT] + [--pwg-stat PWG_STAT] + [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] + [--text TEXT] [--output-dir OUTPUT_DIR] + [--device DEVICE] [--verbose VERBOSE] Synthesize with fastspeech2 & parallel wavegan. @@ -161,6 +182,8 @@ optional arguments: spectrogram when training parallel wavegan. --phones-dict PHONES_DICT phone vocabulary file. + --speaker-dict SPEAKER_DICT + speaker id map file. --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output-dir OUTPUT_DIR output dir. @@ -175,7 +198,34 @@ optional arguments: 5. `--output-dir` is the directory to save synthesized audio files. 6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis. -You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models. -```bash +## Pretrained Model +A pretrained FastSpeech2 model with no silence at the edges of audio can be downloaded here: [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip) +FastSpeech2 checkpoint contains files listed below. +```text +fastspeech2_nosil_vctk_ckpt_0.5 +├── default.yaml # default config used to train fastspeech2 +├── phone_id_map.txt # phone vocabulary file when training fastspeech2 +├── snapshot_iter_66200.pdz # model parameters and optimizer states +├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2 +└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2 +``` +You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
+```bash +source path.sh + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ + --fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \ + --fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ + --fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=exp/default/test_e2e \ + --device="gpu" \ + --phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ + --speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt ``` diff --git a/examples/vctk/fastspeech2/vctk/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml similarity index 100% rename from examples/vctk/fastspeech2/vctk/conf/default.yaml rename to examples/vctk/tts3/conf/default.yaml diff --git a/examples/vctk/fastspeech2/vctk/preprocess.sh b/examples/vctk/tts3/local/preprocess.sh similarity index 90% rename from examples/vctk/fastspeech2/vctk/preprocess.sh rename to examples/vctk/tts3/local/preprocess.sh index df4b634a3050c72d674b9865ce6515ded55a7184..4d589d666f29fc8dc5073e00a1a3bc2fa7916253 100755 --- a/examples/vctk/fastspeech2/vctk/preprocess.sh +++ b/examples/vctk/tts3/local/preprocess.sh @@ -1,9 +1,9 @@ #!/bin/bash -stage=1 +stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./vctk_alignment \ --output durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # extract features echo "Extract features ..." - python3 ../preprocess.py \ + python3 ${BIN_DIR}/preprocess.py \ --dataset=vctk \ --rootdir=~/datasets/VCTK-Corpus-0.92/ \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --num-cpu=20 \ --cut-sil=True fi @@ -46,7 +46,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize and covert phone/speaker to id, dev and test should use train's stats echo "Normalize ..."
- python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --speech-stats=dump/train/speech_stats.npy \ @@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt - python3 ../normalize.py \ + python3 ${BIN_DIR}/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --speech-stats=dump/train/speech_stats.npy \ diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh new file mode 100755 index 0000000000000000000000000000000000000000..ca11296916f82450c2de57283ef853a2f56594d1 --- /dev/null +++ b/examples/vctk/tts3/local/synthesize.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --device="gpu" \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh new file mode 100755 index 0000000000000000000000000000000000000000..d919bb08e10caaf8bf8a421bb386a41441a68d66 --- /dev/null +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \ + --fastspeech2-config=${config_path} \ + --fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --fastspeech2-stat=dump/train/speech_stats.npy \ + --pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ + --pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ + --pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output-dir=${train_output_path}/test_e2e \ + --device="gpu" \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/fastspeech2/vctk/run.sh b/examples/vctk/tts3/local/train.sh similarity index 61% rename from examples/vctk/fastspeech2/vctk/run.sh rename to examples/vctk/tts3/local/train.sh index d4f06da9180fe5a4ba5500040735910d35b8ca77..be6051c979d0ff319be06ca0899385b89e286fcf 100755 --- a/examples/vctk/fastspeech2/vctk/run.sh +++ b/examples/vctk/tts3/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash -python3 ../train.py \ +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + 
--output-dir=${train_output_path} \ --nprocs=2 \ --phones-dict=dump/phone_id_map.txt \ --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..561d01632ba4b63b582ef07e460d31842823f07c --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=fastspeech2 +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL} diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..0562ef3f408d64cad4339243c55a470d6d0d29a5 --- /dev/null +++ b/examples/vctk/tts3/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_331.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this cannot be used together with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md b/examples/vctk/voc1/README.md similarity index 83% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md rename to examples/vctk/voc1/README.md index 29538dc46ad7938c1eebed615b2ffb15bd50de74..b74b9d4a74e0bfd699c5bc554fce9e9f1b7fb294 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/README.md +++ b/examples/vctk/voc1/README.md @@ -1,6 +1,7 @@ # Parallel WaveGAN with VCTK This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443). -## Preprocess the dataset + +## Dataset ### Download and Extract the datasaet Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. @@ -11,12 +12,21 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith 1. `p315`, because no txt for it. 2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. -### Preprocess the dataset +## Get Started Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. Assume the path to the MFA result of VCTK is `./vctk_alignment`. -Run the command below to preprocess the dataset. +Run the command below to +1.
**source path**. +2. preprocess the dataset, +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. ```bash -./preprocess.sh +./run.sh +``` +### Preprocess the dataset +```bash +./local/preprocess.sh ${conf_path} ``` When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. @@ -38,12 +48,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance. -## Train the model - -`./run.sh` calls `../train.py`. +### Train the model ```bash -./run.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} ``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. Here's the complete help message. ```text @@ -88,15 +97,10 @@ benchmark: 3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory. 4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported. 5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'. - -## Pretrained Models - - -## Synthesize - -`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`. +### Synthesize +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash -./synthesize.sh +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` ```text usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT] @@ -124,5 +128,16 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. +## Pretrained Models +Pretrained models can be downloaded here: [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip). + +Parallel WaveGAN checkpoint contains files listed below. + +```text +pwg_vctk_ckpt_0.5 +├── pwg_default.yaml # default config used to train parallel wavegan +├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan +└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/conf/default.yaml rename to examples/vctk/voc1/conf/default.yaml diff --git a/examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh similarity index 83% rename from examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh rename to examples/vctk/voc1/local/preprocess.sh index 3ed4c0cccf2d66bcb6e74a319a2a0fbd7f6445fb..88a478cd537c32632f680f6a5c9f6f0576e4d93b 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/vctk/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -3,7 +3,7 @@ stage=0 stop_stage=100 -export MAIN_ROOT=`realpath ${PWD}/../../../../` +config_path=$1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # get durations from MFA's result @@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ --inputdir=./vctk_alignment \ --output=durations.txt \ - --config=conf/default.yaml + --config=${config_path} fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features echo "Extract features ..." - python3 ../../preprocess.py \ + python3 ${BIN_DIR}/../preprocess.py \ --rootdir=~/datasets/VCTK-Corpus-0.92/ \ --dataset=vctk \ --dumpdir=dump \ --dur-file=durations.txt \ - --config=conf/default.yaml \ + --config=${config_path} \ --cut-sil=True \ --num-cpu=20 fi @@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # normalize, dev and test should use train's stats echo "Normalize ..." - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ --stats=dump/train/feats_stats.npy - python3 ../../normalize.py \ + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ --stats=dump/train/feats_stats.npy diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 100755 index 0000000000000000000000000000000000000000..9f904ac0c6e7006ab40c3d8aaa7c457ad1495b36 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh b/examples/vctk/voc1/local/train.sh similarity index 60% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh rename to examples/vctk/voc1/local/train.sh index df8cefd88eb0c331bc58516a760440255275353e..1ef860c36a527aa361dc3a081febae20557e34c7 100755 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/run.sh +++ b/examples/vctk/voc1/local/train.sh @@ -1,10 +1,13 @@ #!/bin/bash +config_path=$1 +train_output_path=$2 + FLAGS_cudnn_exhaustive_search=true \ FLAGS_conv_workspace_size_limit=4000 \ -python ../train.py \ +python ${BIN_DIR}/train.py \ --train-metadata=dump/train/norm/metadata.jsonl \ 
--dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=conf/default.yaml \ - --output-dir=exp/default \ + --config=${config_path} \ + --output-dir=${train_output_path} \ --nprocs=1 diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 100755 index 0000000000000000000000000000000000000000..28d39ae0078daefa770c0b9c27c4bafe6e40f254 --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=parallelwave_gan +export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/run.sh b/examples/vctk/voc1/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..7d0fdb21e32ac7d0a66cc9598884baa5954a5707 --- /dev/null +++ b/examples/vctk/voc1/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this cannot be used together with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md deleted file mode 100644 index 9512ac09c1031228e9d6e395e97118df1e95d93e..0000000000000000000000000000000000000000 --- a/examples/voxceleb/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Voxceleb - -* spk0 - ge2e diff --git a/examples/voxceleb/spk0/local/ge2e/README_cn.md b/examples/voxceleb/spk0/local/ge2e/README_cn.md deleted file mode 100644 index 7777e4dd4822bcee06b40a6470339cd599c051fe..0000000000000000000000000000000000000000 --- a/examples/voxceleb/spk0/local/ge2e/README_cn.md +++ /dev/null @@ -1,124 +0,0 @@ -# Speaker Encoder - -本实验是的在多说话人数据集上以 Speaker Verification 为任务训练一个 speaker encoder, 这是作为 transfer learning from speaker verification to multispeaker text-to-speech synthesis 实验的一部分, 可以在 [tacotron2_aishell3](../tacotron2_aishell3) 中找到。用训练好的模型来提取音频的 utterance embedding. - -## 模型 - -本实验使用的模型是 [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf) 中的 speaker encoder text independent 模型。使用的是 GE2E softmax 损失函数。 - -## 目录结构 - -```text -ge2e -├── README_cn.md -├── audio_processor.py -├── config.py -├── dataset_processors.py -├── inference.py -├── preprocess.py -├── random_cycle.py -├── speaker_verification_dataset.py -└── train.py -``` - -## 数据集下载 - -本实验支持了 Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata 数据集。可以在对应的页面下载。 - -1.
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
deleted file mode 100644
index 9512ac09c1031228e9d6e395e97118df1e95d93e..0000000000000000000000000000000000000000
--- a/examples/voxceleb/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Voxceleb
-
-* spk0 - ge2e
diff --git a/examples/voxceleb/spk0/local/ge2e/README_cn.md b/examples/voxceleb/spk0/local/ge2e/README_cn.md
deleted file mode 100644
index 7777e4dd4822bcee06b40a6470339cd599c051fe..0000000000000000000000000000000000000000
--- a/examples/voxceleb/spk0/local/ge2e/README_cn.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Speaker Encoder
-
-This experiment trains a speaker encoder on multi-speaker datasets, with speaker verification as the training task. It is part of the experiment on transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_aishell3). The trained model is used to extract utterance embeddings from audio.
-
-## Model
-
-The model used in this experiment is the text-independent speaker encoder from [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf), trained with the GE2E softmax loss.
-
-## File Structure
-
-```text
-ge2e
-├── README_cn.md
-├── audio_processor.py
-├── config.py
-├── dataset_processors.py
-├── inference.py
-├── preprocess.py
-├── random_cycle.py
-├── speaker_verification_dataset.py
-└── train.py
-```
-
-## Download Datasets
-
-This experiment supports the Librispeech-other-500, VoxCeleb, VoxCeleb2, aidatatang-200zh and magicdata datasets, which can be downloaded from the corresponding pages.
-
-1. Librispeech/train-other-500
-
-   An English multi-speaker dataset ([download link](https://www.openslr.org/resources/12/train-other-500.tar.gz)); only the train-other-500 subset is used in our experiment.
-
-2. VoxCeleb1
-
-   An English multi-speaker dataset ([download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html)); download the four archives Dev A to Dev D under Audio Files, then concatenate and extract them.
-
-3. VoxCeleb2
-
-   An English multi-speaker dataset ([download link](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html)); download the eight archives Dev A to Dev H under Audio Files, then concatenate and extract them.
-
-4. Aidatatang-200zh
-
-   A Mandarin multi-speaker dataset ([download link](https://www.openslr.org/62/)).
-
-5. magicdata
-
-   A Mandarin multi-speaker dataset ([download link](https://www.openslr.org/68/)).
-
-Other datasets can also be used, as long as they are downloaded and processed to meet the requirements below.
-
-## Preprocess Datasets
-
-Multi-speaker datasets are used for training; transcriptions are not used. To enlarge the amount of data, several datasets can be merged into one for training. The processed files are organized as shown below: the spectrogram of each utterance is stored in `.npy` format, in a two-level speaker-utterance directory structure. Because datasets are merged, the dataset name is prepended to the speaker id to avoid speaker id collisions.
-
-```text
-dataset_root
-├── dataset01_speaker01/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
-├── dataset01_speaker02/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
-├── dataset02_speaker01/
-│   ├── utterance01.npy
-│   ├── utterance02.npy
-│   └── utterance03.npy
-└── dataset02_speaker02/
-    ├── utterance01.npy
-    ├── utterance02.npy
-    └── utterance03.npy
-```
-
-Run the preprocessing script:
-
-```bash
-python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
-```
-
-Here `datasets_root` is the path containing the raw datasets, `output_dir` is the output path for the merged datasets, and `dataset_names` names the datasets to process; several datasets can be separated by commas, e.g. 'librispeech_other, voxceleb1'. Currently supported datasets are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
-
-## Train
-
-After preprocessing, train with the script below.
-
-```bash
-python train.py --data=<data> --output=<output> --device="gpu" --nprocs=1
-```
-
-- `--data` is the path of the preprocessed dataset.
-- `--output` is the path to save training results, usually a subdirectory under `runs`. The saved results include visualdl log files, text logs, a backup of the running config, and a `checkpoints` directory containing parameter files and optimizer state files. If the given output path contains results of a previous run, the latest parameter file and optimizer state file are loaded automatically before training.
-- `--device` is the device to run on; 'cpu' and 'gpu' are currently supported.
-- `--nprocs` is the number of processes to run with. Multi-process training is currently supported only with 'gpu'; the `CUDA_VISIBLE_DEVICES` environment variable can be used to specify which GPUs are visible.
-
-There are a few more options.
-
-- `--config` is a `.yaml` config file used to override the default config (see `config.py` for the defaults).
-- `--opts` overrides the config further from the command line. It is the last option passed, given as space-separated KEY VALUE pairs.
-- `--checkpoint_path` specifies the checkpoint to resume from, without the extension. The parameter file (`.pdparams`) and optimizer state file (`.pdopt`) of the same name are loaded to resume training. Resuming from this option takes priority over automatically resuming from the `output` directory.
-
-## Pretrained Model
-
-The pretrained model was trained on Librispeech-other-500 and voxceleb1 for 1560k steps, then on aidatatang_200zh and magicdata up to 3000k steps.
-
-Download link: [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
-
-## Inference
-
-Use the trained model to generate an utterance embedding for every utterance in a dataset.
-
-```bash
-python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
-```
-
-- `--input` is the path of the dataset to process.
-- `--output` is the output path; it keeps the same directory structure as `--input`, and for each audio file in the input a `*.npy` file of the same name is written, containing the utterance embedding extracted from that audio file.
-- `--checkpoint_path` is the path of the parameter file used for inference, without the extension.
-- `--pattern` is the glob pattern used to select the audio files to process; it defaults to `*.wav`.
-- `--device` and `--opts` have the same semantics as in the training script.
-
-## References
-
-1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
-2. [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
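The GE2E tooling documented in the deleted README is not dropped: the scripts move into the `parakeet.exps.ge2e` package (see the renames below). A minimal sketch of the equivalent invocations after the move; the dataset root, output directory and run name are illustrative only, and the flags are the ones documented above:

```bash
# tilde expands in the shell assignment; adjust to where your corpora live
datasets_root=~/datasets

# merge and preprocess two of the supported datasets into one training set
python -m parakeet.exps.ge2e.preprocess \
    --datasets_root=${datasets_root} \
    --output_dir=${datasets_root}/GE2E/processed \
    --dataset_names="librispeech_other,voxceleb1"

# train the speaker encoder on a single GPU
python -m parakeet.exps.ge2e.train \
    --data=${datasets_root}/GE2E/processed \
    --output=runs/ge2e \
    --device="gpu" \
    --nprocs=1
```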
diff --git a/examples/voxceleb/spk0/run.sh b/examples/voxceleb/spk0/run.sh
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/parakeet/__init__.py b/parakeet/__init__.py
index 87528b83309f9dbb5b7f00940f1e43596aea75f8..8a0acc48ac5f11884f1f2b957c74ba1f41a79b4f 100644
--- a/parakeet/__init__.py
+++ b/parakeet/__init__.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import logging
+
 from . import data
 from . import datasets
+from . import exps
 from . import frontend
 from . import models
 from . import modules
diff --git a/parakeet/audio/__init__.py b/parakeet/audio/__init__.py
index 23e378fde7b0914a21cae4ee68a3b0c72fd0dfde..7747b794536aab1aab1de18dc0b686b8814efdbb 100644
--- a/parakeet/audio/__init__.py
+++ b/parakeet/audio/__init__.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from .audio import AudioProcessor
 from .spec_normalizer import LogMagnitude
 from .spec_normalizer import NormalizerBase
diff --git a/parakeet/data/__init__.py b/parakeet/data/__init__.py
index 6f15dbbecae61390798d8270ec061e242d557227..c71c05bd7f0218d0bc373e57386aa46eeea0fdb4 100644
--- a/parakeet/data/__init__.py
+++ b/parakeet/data/__init__.py
@@ -13,6 +13,5 @@
 # limitations under the License.
 """Parakeet's infrastructure for data processing.
 """
-
-from .dataset import *
 from .batch import *
+from .dataset import *
diff --git a/parakeet/datasets/__init__.py b/parakeet/datasets/__init__.py
index cbdcdfa49a6ef89ecba9282a94b45d3f7f54eecb..fc64a82f2700b0183351c396f563ad0efd7d48e8 100644
--- a/parakeet/datasets/__init__.py
+++ b/parakeet/datasets/__init__.py
@@ -11,6 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from .common import *
 from .ljspeech import *
diff --git a/parakeet/datasets/preprocess_utils.py b/parakeet/datasets/preprocess_utils.py
index ddbedf5ce4bad4a86b2da69f206396e61712b106..8b01f6c3cc2d5496377c547f85da8089d811ac66 100644
--- a/parakeet/datasets/preprocess_utils.py
+++ b/parakeet/datasets/preprocess_utils.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import re
diff --git a/parakeet/datasets/vocoder_batch_fn.py b/parakeet/datasets/vocoder_batch_fn.py
index 925303b5fd7e3d8a30b104b5d1da84c00a8eb9e6..30adb142d0c6db69247301cd7d1177d7736cc698 100644
--- a/parakeet/datasets/vocoder_batch_fn.py
+++ b/parakeet/datasets/vocoder_batch_fn.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- import numpy as np import paddle diff --git a/parakeet/exps/__init__.py b/parakeet/exps/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/parakeet/exps/__init__.py +++ b/parakeet/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/exps/fastspeech2/__init__.py b/parakeet/exps/fastspeech2/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/parakeet/exps/fastspeech2/__init__.py +++ b/parakeet/exps/fastspeech2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.py b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py similarity index 100% rename from examples/vctk/fastspeech2/aishell3/synthesize_e2e.py rename to parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py index 13f59bfd93cd14165681a1c806d96aa41ff14f2f..825b3ed364f0673913fb10cb519808cb7f07b9a9 100644 --- a/examples/vctk/fastspeech2/aishell3/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/vctk/synthesize_e2e.py b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py similarity index 100% rename from examples/vctk/fastspeech2/vctk/synthesize_e2e.py rename to parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py index 65a927b4f2b991c0eaf7347e19634bbca065ae3a..a476198269af4ba4271eb3ab24a98cd04956fb83 100644 --- a/examples/vctk/fastspeech2/vctk/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/normalize.py b/parakeet/exps/fastspeech2/normalize.py similarity index 100% rename from examples/vctk/fastspeech2/normalize.py rename to parakeet/exps/fastspeech2/normalize.py index 2d40bdf37f885bd599514788463e1b5c11a1f406..b4b31e3111b00b66fa71d1609ef687307450b816 100644 --- a/examples/vctk/fastspeech2/normalize.py +++ b/parakeet/exps/fastspeech2/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/vctk/fastspeech2/preprocess.py b/parakeet/exps/fastspeech2/preprocess.py similarity index 100% rename from examples/vctk/fastspeech2/preprocess.py rename to parakeet/exps/fastspeech2/preprocess.py index ee2b3f915b81fc47db744c5e75fda30faff505e1..bb796b64c7b969483c3cb2afa947de7c19dd6751 100644 --- a/examples/vctk/fastspeech2/preprocess.py +++ b/parakeet/exps/fastspeech2/preprocess.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os from concurrent.futures import ThreadPoolExecutor @@ -26,6 +25,8 @@ import librosa import numpy as np import tqdm import yaml +from yacs.config import CfgNode + from parakeet.data.get_feats import Energy from parakeet.data.get_feats import LogMelFBank from parakeet.data.get_feats import Pitch @@ -34,7 +35,6 @@ from parakeet.datasets.preprocess_utils import get_input_token from parakeet.datasets.preprocess_utils import get_phn_dur from parakeet.datasets.preprocess_utils import get_spk_id_map from parakeet.datasets.preprocess_utils import merge_silence -from yacs.config import CfgNode def process_sentence(config: Dict[str, Any], diff --git a/examples/vctk/fastspeech2/synthesize.py b/parakeet/exps/fastspeech2/synthesize.py similarity index 100% rename from examples/vctk/fastspeech2/synthesize.py rename to parakeet/exps/fastspeech2/synthesize.py index c1329f8d03764045937e0000db29f83dbc03f08a..913277571de5d1568eccccbc18ba310232c815c5 100644 --- a/examples/vctk/fastspeech2/synthesize.py +++ b/parakeet/exps/fastspeech2/synthesize.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -22,6 +21,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference diff --git a/examples/vctk/fastspeech2/baker/synthesize_e2e.py b/parakeet/exps/fastspeech2/synthesize_e2e.py similarity index 100% rename from examples/vctk/fastspeech2/baker/synthesize_e2e.py rename to parakeet/exps/fastspeech2/synthesize_e2e.py index 75e06edf78aaf254f968fa090a74fdd12612d4bf..dd1b57c8a9400f869c14268798a85c190bb599db 100644 --- a/examples/vctk/fastspeech2/baker/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -20,13 +19,14 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, fastspeech2_config, pwg_config): diff --git a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py b/parakeet/exps/fastspeech2/synthesize_e2e_en.py similarity index 100% rename from examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py rename to parakeet/exps/fastspeech2/synthesize_e2e_en.py index 6732aa40883e01189732cbed63563b7e62957b35..4e8a20c757e1c4925d5f46134f0bf7a36d08be9d 100644 --- a/examples/vctk/fastspeech2/ljspeech/synthesize_e2e.py +++ b/parakeet/exps/fastspeech2/synthesize_e2e_en.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging from pathlib import Path @@ -21,6 +20,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Inference diff --git a/examples/vctk/fastspeech2/train.py b/parakeet/exps/fastspeech2/train.py similarity index 100% rename from examples/vctk/fastspeech2/train.py rename to parakeet/exps/fastspeech2/train.py index 1ea2c561e96ef56f62eda1a39c73e837458019b5..59b1ea3af5eb017dce1f20749097a3181359f876 100644 --- a/examples/vctk/fastspeech2/train.py +++ b/parakeet/exps/fastspeech2/train.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging import os import shutil +from pathlib import Path import jsonlines import numpy as np @@ -25,9 +25,12 @@ from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable -from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn +from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.fastspeech2 import FastSpeech2 from parakeet.models.fastspeech2 import FastSpeech2Evaluator from parakeet.models.fastspeech2 import FastSpeech2Updater @@ -36,9 +39,6 @@ from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from pathlib import Path -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/examples/vctk/GANVocoder/README.md b/parakeet/exps/gan_vocoder/README.md similarity index 100% rename from examples/vctk/GANVocoder/README.md rename to parakeet/exps/gan_vocoder/README.md diff --git a/parakeet/exps/gan_vocoder/__init__.py b/parakeet/exps/gan_vocoder/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/parakeet/exps/gan_vocoder/__init__.py +++ b/parakeet/exps/gan_vocoder/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/vctk/GANVocoder/normalize.py b/parakeet/exps/gan_vocoder/normalize.py similarity index 100% rename from examples/vctk/GANVocoder/normalize.py rename to parakeet/exps/gan_vocoder/normalize.py index 74d838adbc6cd4af3017e73343d49c07fb384f62..c772594bb499378ff7c537125faa58be532e0e36 100644 --- a/examples/vctk/GANVocoder/normalize.py +++ b/parakeet/exps/gan_vocoder/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py b/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..185a92b8d94d3426d616c0624f0f2ee04339349e --- /dev/null +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/vctk/GANVocoder/parallelwave_gan/synthesize.py b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/synthesize.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py index e57ddf88028526091ec11887130c8a62fa0a519c..9129caa54ebab33e726d5ea215c9ff222d5f22a6 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/synthesize.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py @@ -11,11 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import os from pathlib import Path -from timer import timer import jsonlines import numpy as np @@ -23,9 +21,11 @@ import paddle import soundfile as sf import yaml from paddle import distributed as dist +from timer import timer +from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.parallel_wavegan import PWGGenerator -from yacs.config import CfgNode def main(): diff --git a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py similarity index 87% rename from examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index f20f0a7267606d22287bd8527535665d6a822d44..c451a51c1669c4666be918d8caeee3cbf15bd49b 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/baker/synthesize_from_wav.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse -import os import logging +import os from pathlib import Path import librosa @@ -22,20 +21,12 @@ import numpy as np import paddle import soundfile as sf import yaml +from yacs.config import CfgNode + from parakeet.data.get_feats import LogMelFBank from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode as Configuration - - -def get_cfg_default(): - config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve() - with open(config_path, 'rt') as f: - _C = yaml.safe_load(f) - _C = Configuration(_C) - config = _C.clone() - return config def evaluate(args, config): @@ -91,7 +82,7 @@ def main(): description="Synthesize with parallel wavegan.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="parallel wavegan config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") parser.add_argument( "--stat", @@ -108,9 +99,8 @@ def main(): paddle.set_device(args.device) - config = get_cfg_default() - if args.config: - config.merge_from_file(args.config) + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) print("========Args========") print(yaml.safe_dump(vars(args))) diff --git a/examples/vctk/GANVocoder/parallelwave_gan/train.py b/parakeet/exps/gan_vocoder/parallelwave_gan/train.py similarity index 100% rename from examples/vctk/GANVocoder/parallelwave_gan/train.py rename to parakeet/exps/gan_vocoder/parallelwave_gan/train.py index 7e6aa9a6218261149198a9cd18ca0519874d2d98..7a16ca597ed49014011648a04d298a7a5906ef43 100644 --- a/examples/vctk/GANVocoder/parallelwave_gan/train.py +++ b/parakeet/exps/gan_vocoder/parallelwave_gan/train.py @@ -11,11 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-
 import argparse
 import logging
 import os
 import shutil
+from pathlib import Path
 
 import jsonlines
 import numpy as np
@@ -28,20 +28,20 @@ from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
 from paddle.optimizer import Adam  # No RAdam
 from paddle.optimizer.lr import StepDecay
+from visualdl import LogWriter
+from yacs.config import CfgNode
+
 from parakeet.datasets.data_table import DataTable
 from parakeet.datasets.vocoder_batch_fn import Clip
-from parakeet.models.parallel_wavegan import PWGGenerator
 from parakeet.models.parallel_wavegan import PWGDiscriminator
-from parakeet.models.parallel_wavegan import PWGUpdater
 from parakeet.models.parallel_wavegan import PWGEvaluator
+from parakeet.models.parallel_wavegan import PWGGenerator
+from parakeet.models.parallel_wavegan import PWGUpdater
 from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
 from parakeet.training.extensions.snapshot import Snapshot
 from parakeet.training.extensions.visualizer import VisualDL
 from parakeet.training.seeding import seed_everything
 from parakeet.training.trainer import Trainer
-from pathlib import Path
-from visualdl import LogWriter
-from yacs.config import CfgNode
 
 
 def train_sp(args, config):
diff --git a/examples/vctk/GANVocoder/preprocess.py b/parakeet/exps/gan_vocoder/preprocess.py
similarity index 100%
rename from examples/vctk/GANVocoder/preprocess.py
rename to parakeet/exps/gan_vocoder/preprocess.py
index e9f18286992a5245d3d9edc3049d0e0aff3aca67..c10143c71666e5a7af415c8bcf1921cd05e699ca 100644
--- a/examples/vctk/GANVocoder/preprocess.py
+++ b/parakeet/exps/gan_vocoder/preprocess.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import os
+from concurrent.futures import ThreadPoolExecutor
 from operator import itemgetter
+from pathlib import Path
 from typing import Any
 from typing import Dict
 from typing import List
@@ -24,12 +25,11 @@ import librosa
 import numpy as np
 import tqdm
 import yaml
-from concurrent.futures import ThreadPoolExecutor
+from yacs.config import CfgNode
+
 from parakeet.data.get_feats import LogMelFBank
 from parakeet.datasets.preprocess_utils import get_phn_dur
 from parakeet.datasets.preprocess_utils import merge_silence
-from pathlib import Path
-from yacs.config import CfgNode
 
 
 def process_sentence(config: Dict[str, Any],
diff --git a/parakeet/exps/gan_vocoder/pwgan/__init__.py b/parakeet/exps/gan_vocoder/pwgan/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/parakeet/exps/ge2e/__init__.py b/parakeet/exps/ge2e/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54
--- /dev/null
+++ b/parakeet/exps/ge2e/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/voxceleb/spk0/local/ge2e/audio_processor.py b/parakeet/exps/ge2e/audio_processor.py similarity index 99% rename from examples/voxceleb/spk0/local/ge2e/audio_processor.py rename to parakeet/exps/ge2e/audio_processor.py index 921e9990181c5af4513e957680dc468a112bbaba..2d6bbe34ef31461d19ff409c5f2c7830a03c7a3e 100644 --- a/examples/voxceleb/spk0/local/ge2e/audio_processor.py +++ b/parakeet/exps/ge2e/audio_processor.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import struct from pathlib import Path from warnings import warn -import struct -from scipy.ndimage.morphology import binary_dilation -import numpy as np import librosa +import numpy as np +from scipy.ndimage.morphology import binary_dilation try: import webrtcvad @@ -97,7 +96,7 @@ def trim_long_silences(wav, return ret[width - 1:] / width audio_mask = moving_average(voice_flags, vad_moving_average_width) - audio_mask = np.round(audio_mask).astype(np.bool) + audio_mask = np.round(audio_mask).astype(bool) # Dilate the voiced regions audio_mask = binary_dilation(audio_mask, diff --git a/examples/voxceleb/spk0/local/ge2e/config.py b/parakeet/exps/ge2e/config.py similarity index 99% rename from examples/voxceleb/spk0/local/ge2e/config.py rename to parakeet/exps/ge2e/config.py index b8d748aacb4c512d795b9da60639ed793d8978a2..3e114291647e5c7869c3f50c556cbae3c382bd92 100644 --- a/examples/voxceleb/spk0/local/ge2e/config.py +++ b/parakeet/exps/ge2e/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode _C = CfgNode() diff --git a/examples/voxceleb/spk0/local/ge2e/dataset_processors.py b/parakeet/exps/ge2e/dataset_processors.py similarity index 98% rename from examples/voxceleb/spk0/local/ge2e/dataset_processors.py rename to parakeet/exps/ge2e/dataset_processors.py index 50a8f3e73456107a90306fd06d37f8eac4b58938..29b584107b1a9c80202612cc9aacc42fad258b4d 100644 --- a/examples/voxceleb/spk0/local/ge2e/dataset_processors.py +++ b/parakeet/exps/ge2e/dataset_processors.py @@ -11,16 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import multiprocessing as mp from functools import partial -from typing import List from pathlib import Path -import multiprocessing as mp +from typing import List import numpy as np from tqdm import tqdm -from audio_processor import SpeakerVerificationPreprocessor +from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor): diff --git a/examples/voxceleb/spk0/local/ge2e/inference.py b/parakeet/exps/ge2e/inference.py similarity index 97% rename from examples/voxceleb/spk0/local/ge2e/inference.py rename to parakeet/exps/ge2e/inference.py index 1cca132dab176e8439aeb94bb05bebe60f07bd2f..156866627d6e4c5d4c52329723ed0fce90266487 100644 --- a/examples/voxceleb/spk0/local/ge2e/inference.py +++ b/parakeet/exps/ge2e/inference.py @@ -11,19 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 from pathlib import Path
 
-import tqdm
-import paddle
 import numpy as np
+import paddle
+import tqdm
 
+from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
+from parakeet.exps.ge2e.config import get_cfg_defaults
 from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
-from audio_processor import SpeakerVerificationPreprocessor
-from config import get_cfg_defaults
-
 
 def embed_utterance(processor, model, fpath_or_wav):
     # audio processor
diff --git a/examples/voxceleb/spk0/local/ge2e/preprocess.py b/parakeet/exps/ge2e/preprocess.py
similarity index 90%
rename from examples/voxceleb/spk0/local/ge2e/preprocess.py
rename to parakeet/exps/ge2e/preprocess.py
index b1e59460ec773ad08c14d888aabad385e82657fc..f6457251d135906260159053b49e16ae4685109f 100644
--- a/examples/voxceleb/spk0/local/ge2e/preprocess.py
+++ b/parakeet/exps/ge2e/preprocess.py
@@ -11,14 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 from pathlib import Path
-from config import get_cfg_defaults
-from audio_processor import SpeakerVerificationPreprocessor
-from dataset_processors import (process_librispeech, process_voxceleb1,
-                                process_voxceleb2, process_aidatatang_200zh,
-                                process_magicdata)
+
+from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
+from parakeet.exps.ge2e.config import get_cfg_defaults
+from parakeet.exps.ge2e.dataset_processors import process_aidatatang_200zh
+from parakeet.exps.ge2e.dataset_processors import process_librispeech
+from parakeet.exps.ge2e.dataset_processors import process_magicdata
+from parakeet.exps.ge2e.dataset_processors import process_voxceleb1
+from parakeet.exps.ge2e.dataset_processors import process_voxceleb2
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
diff --git a/examples/voxceleb/spk0/local/ge2e/random_cycle.py b/parakeet/exps/ge2e/random_cycle.py
similarity index 99%
rename from examples/voxceleb/spk0/local/ge2e/random_cycle.py
rename to parakeet/exps/ge2e/random_cycle.py
index 4a20158135ccd710c61ab02431f2ca513a9e965e..290fd2fa274b66f7802cb0ab529d04099118f624 100644
--- a/examples/voxceleb/spk0/local/ge2e/random_cycle.py
+++ b/parakeet/exps/ge2e/random_cycle.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import random
diff --git a/examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py b/parakeet/exps/ge2e/speaker_verification_dataset.py
similarity index 97%
rename from examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py
rename to parakeet/exps/ge2e/speaker_verification_dataset.py
index c9cfda29c8e104dae69a39bfc615cf8c177e6cf6..896676d96bf971632fd09d9d02e5cb369696fb10 100644
--- a/examples/voxceleb/spk0/local/ge2e/speaker_verification_dataset.py
+++ b/parakeet/exps/ge2e/speaker_verification_dataset.py
@@ -11,14 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- import random from pathlib import Path import numpy as np -from paddle.io import Dataset, BatchSampler +from paddle.io import BatchSampler +from paddle.io import Dataset -from random_cycle import random_cycle +from parakeet.exps.ge2e.random_cycle import random_cycle class MultiSpeakerMelDataset(Dataset): diff --git a/examples/voxceleb/spk0/local/ge2e/train.py b/parakeet/exps/ge2e/train.py similarity index 93% rename from examples/voxceleb/spk0/local/ge2e/train.py rename to parakeet/exps/ge2e/train.py index 950d486df4c21f860e63ceecc7e29d6613babbb6..7a59c436b45dc083797be1bf60c880dafcca8f71 100644 --- a/examples/voxceleb/spk0/local/ge2e/train.py +++ b/parakeet/exps/ge2e/train.py @@ -11,23 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import time -from paddle import distributed as dist -from paddle.optimizer import Adam from paddle import DataParallel +from paddle import distributed as dist from paddle.io import DataLoader from paddle.nn.clip import ClipGradByGlobalNorm +from paddle.optimizer import Adam +from parakeet.exps.ge2e.config import get_cfg_defaults +from parakeet.exps.ge2e.speaker_verification_dataset import Collate +from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset +from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder -from parakeet.training import ExperimentBase from parakeet.training import default_argument_parser - -from speaker_verification_dataset import MultiSpeakerMelDataset -from speaker_verification_dataset import MultiSpeakerSampler -from speaker_verification_dataset import Collate -from config import get_cfg_defaults +from parakeet.training import ExperimentBase class Ge2eExperiment(ExperimentBase): diff --git a/examples/csmsc/speedyspeech/sentences.txt b/parakeet/exps/sentences.txt similarity index 100% rename from examples/csmsc/speedyspeech/sentences.txt rename to parakeet/exps/sentences.txt diff --git a/examples/vctk/fastspeech2/sentences_en.txt b/parakeet/exps/sentences_en.txt similarity index 100% rename from examples/vctk/fastspeech2/sentences_en.txt rename to parakeet/exps/sentences_en.txt diff --git a/parakeet/exps/speedyspeech/__init__.py b/parakeet/exps/speedyspeech/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/parakeet/exps/speedyspeech/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/csmsc/speedyspeech/baker/inference.py b/parakeet/exps/speedyspeech/inference.py similarity index 100% rename from examples/csmsc/speedyspeech/baker/inference.py rename to parakeet/exps/speedyspeech/inference.py index a1d185402cae5257577061f9df6a6d2c16f4fdc4..bf144d760f597cb72479beb87ea4f752eb97500b 100644 --- a/examples/csmsc/speedyspeech/baker/inference.py +++ b/parakeet/exps/speedyspeech/inference.py @@ -11,13 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import os from pathlib import Path import soundfile as sf from paddle import inference + from parakeet.frontend.zh_frontend import Frontend diff --git a/examples/csmsc/speedyspeech/normalize.py b/parakeet/exps/speedyspeech/normalize.py similarity index 100% rename from examples/csmsc/speedyspeech/normalize.py rename to parakeet/exps/speedyspeech/normalize.py index eeb58bb7c0d52a28599b01b7550fd7ccf80083a8..8f02c33cc4b33c7281a6bb017e2331e02d86f5ca 100644 --- a/examples/csmsc/speedyspeech/normalize.py +++ b/parakeet/exps/speedyspeech/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/csmsc/speedyspeech/preprocess.py b/parakeet/exps/speedyspeech/preprocess.py similarity index 100% rename from examples/csmsc/speedyspeech/preprocess.py rename to parakeet/exps/speedyspeech/preprocess.py index 647c9b3633ac5032527b9329f40e2bfa63a839c9..f3ae294d8bf7674248432fc547a8f248ec68ad4a 100644 --- a/examples/csmsc/speedyspeech/preprocess.py +++ b/parakeet/exps/speedyspeech/preprocess.py @@ -11,27 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import argparse +import re +from concurrent.futures import ThreadPoolExecutor from operator import itemgetter +from pathlib import Path from typing import Any from typing import Dict from typing import List -import argparse import jsonlines import librosa import numpy as np -import re import tqdm import yaml -from concurrent.futures import ThreadPoolExecutor +from yacs.config import CfgNode + from parakeet.data.get_feats import LogMelFBank from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length -from parakeet.datasets.preprocess_utils import get_phones_tones from parakeet.datasets.preprocess_utils import get_phn_dur +from parakeet.datasets.preprocess_utils import get_phones_tones from parakeet.datasets.preprocess_utils import merge_silence -from pathlib import Path -from yacs.config import CfgNode def process_sentence(config: Dict[str, Any], diff --git a/examples/csmsc/speedyspeech/synthesize.py b/parakeet/exps/speedyspeech/synthesize.py similarity index 100% rename from examples/csmsc/speedyspeech/synthesize.py rename to parakeet/exps/speedyspeech/synthesize.py index 4225071ec075172206afd4a85294c18f70a0a2a5..43ab4a69bf28510ff69c6f19466b7b79b1cd776c 100644 --- a/examples/csmsc/speedyspeech/synthesize.py +++ b/parakeet/exps/speedyspeech/synthesize.py @@ -11,25 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os -import logging import argparse +import logging +import os from pathlib import Path import jsonlines import numpy as np -import soundfile as sf import paddle +import soundfile as sf import yaml from paddle import jit from paddle.static import InputSpec from yacs.config import CfgNode from parakeet.datasets.data_table import DataTable -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.modules.normalizer import ZScore diff --git a/examples/csmsc/speedyspeech/baker/synthesize_e2e.py b/parakeet/exps/speedyspeech/synthesize_e2e.py similarity index 100% rename from examples/csmsc/speedyspeech/baker/synthesize_e2e.py rename to parakeet/exps/speedyspeech/synthesize_e2e.py index 6dd3abd1a2e6f5cb7dae30d0717f5250c84cb5f3..47e064e958b65689efee52d011e9facbc9671d31 100644 --- a/examples/csmsc/speedyspeech/baker/synthesize_e2e.py +++ b/parakeet/exps/speedyspeech/synthesize_e2e.py @@ -11,25 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse import logging import os from pathlib import Path import numpy as np -import soundfile as sf import paddle +import soundfile as sf import yaml from paddle import jit from paddle.static import InputSpec +from yacs.config import CfgNode + from parakeet.frontend.zh_frontend import Frontend -from parakeet.models.speedyspeech import SpeedySpeech -from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.models.parallel_wavegan import PWGGenerator from parakeet.models.parallel_wavegan import PWGInference +from parakeet.models.speedyspeech import SpeedySpeech +from parakeet.models.speedyspeech import SpeedySpeechInference from parakeet.modules.normalizer import ZScore -from yacs.config import CfgNode def evaluate(args, speedyspeech_config, pwg_config): diff --git a/examples/csmsc/speedyspeech/train.py b/parakeet/exps/speedyspeech/train.py similarity index 100% rename from examples/csmsc/speedyspeech/train.py rename to parakeet/exps/speedyspeech/train.py index f7a4e3018c6c45c020e3c81c3af44b6500fa83b6..ea9fe20d7ffa0d903d52245740e8ae1c4e4a46b0 100644 --- a/examples/csmsc/speedyspeech/train.py +++ b/parakeet/exps/speedyspeech/train.py @@ -11,22 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging import os import shutil +from pathlib import Path import jsonlines import numpy as np import paddle import yaml -from paddle import distributed as dist from paddle import DataParallel +from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.speedyspeech import SpeedySpeech from parakeet.models.speedyspeech import SpeedySpeechEvaluator from parakeet.models.speedyspeech import SpeedySpeechUpdater @@ -35,9 +38,6 @@ from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from pathlib import Path -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/parakeet/exps/tacotron2/__init__.py b/parakeet/exps/tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/parakeet/exps/tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/examples/ljspeech/tts0/local/tacotron2/config.py b/parakeet/exps/tacotron2/config.py similarity index 99% rename from examples/ljspeech/tts0/local/tacotron2/config.py rename to parakeet/exps/tacotron2/config.py index e370e77a8fb285c39dc5a5569c75ff98741d2fc2..0ce2df368424d6b2540278dc7c5daa23d478dbc6 100644 --- a/examples/ljspeech/tts0/local/tacotron2/config.py +++ b/parakeet/exps/tacotron2/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from yacs.config import CfgNode as CN _C = CN() diff --git a/examples/ljspeech/tts0/local/tacotron2/ljspeech.py b/parakeet/exps/tacotron2/ljspeech.py similarity index 97% rename from examples/ljspeech/tts0/local/tacotron2/ljspeech.py rename to parakeet/exps/tacotron2/ljspeech.py index 76e4b3a6efc1e6c2cd50c3c4c065b33213ed3be5..20dc29d37c8d13ec12623e58e7883fefa17e3e78 100644 --- a/examples/ljspeech/tts0/local/tacotron2/ljspeech.py +++ b/parakeet/exps/tacotron2/ljspeech.py @@ -11,14 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from pathlib import Path import pickle +from pathlib import Path import numpy as np from paddle.io import Dataset -from parakeet.data.batch import batch_spec, batch_text_id +from parakeet.data.batch import batch_spec +from parakeet.data.batch import batch_text_id class LJSpeech(Dataset): diff --git a/examples/ljspeech/tts0/local/tacotron2/preprocess.py b/parakeet/exps/tacotron2/preprocess.py similarity index 95% rename from examples/ljspeech/tts0/local/tacotron2/preprocess.py rename to parakeet/exps/tacotron2/preprocess.py index aa7bf244964e4b7db79ac5f6e04e13491c608527..893444855d05887e2e1646a5d04decaaa38df59a 100644 --- a/examples/ljspeech/tts0/local/tacotron2/preprocess.py +++ b/parakeet/exps/tacotron2/preprocess.py @@ -11,21 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import argparse import os import pickle -import argparse from pathlib import Path -import tqdm import numpy as np +import tqdm +from parakeet.audio import AudioProcessor +from parakeet.audio import LogMagnitude from parakeet.datasets import LJSpeechMetaData -from parakeet.audio import AudioProcessor, LogMagnitude +from parakeet.exps.tacotron2.config import get_cfg_defaults from parakeet.frontend import EnglishCharacter -from config import get_cfg_defaults - def create_dataset(config, source_path, target_path, verbose=False): # create output dir diff --git a/examples/ljspeech/tts0/local/tacotron2/synthesize.ipynb b/parakeet/exps/tacotron2/synthesize.ipynb similarity index 100% rename from examples/ljspeech/tts0/local/tacotron2/synthesize.ipynb rename to parakeet/exps/tacotron2/synthesize.ipynb diff --git a/examples/ljspeech/tts0/local/tacotron2/synthesize.py b/parakeet/exps/tacotron2/synthesize.py similarity index 91% rename from examples/ljspeech/tts0/local/tacotron2/synthesize.py rename to parakeet/exps/tacotron2/synthesize.py index f933c32c1eb4f71321039a05a635228f04eb215a..56257c9b047de6943ccaefe1bdd2df2c4a25a475 100644 --- a/examples/ljspeech/tts0/local/tacotron2/synthesize.py +++ b/parakeet/exps/tacotron2/synthesize.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse from pathlib import Path -import paddle import numpy as np +import paddle from matplotlib import pyplot as plt +from parakeet.exps.tacotron2.config import get_cfg_defaults from parakeet.frontend import EnglishCharacter from parakeet.models.tacotron2 import Tacotron2 from parakeet.utils import display -from config import get_cfg_defaults - def main(config, args): paddle.set_device(args.device) @@ -36,8 +34,13 @@ def main(config, args): # inputs input_path = Path(args.input).expanduser() + sentences = [] with open(input_path, "rt") as f: - sentences = f.readlines() + for line in f: + line_list = line.strip().split() + utt_id = line_list[0] + sentence = " ".join(line_list[1:]) + sentences.append((utt_id, sentence)) if args.output is None: output_dir = input_path.parent / "synthesis" diff --git a/examples/ljspeech/tts0/local/tacotron2/train.py b/parakeet/exps/tacotron2/train.py similarity index 95% rename from examples/ljspeech/tts0/local/tacotron2/train.py rename to parakeet/exps/tacotron2/train.py index 82dd4c32e96dced4742174c16bb3b5efa70b0253..3677c271d55c440472caa437a1dbcd096387983b 100644 --- a/examples/ljspeech/tts0/local/tacotron2/train.py +++ b/parakeet/exps/tacotron2/train.py @@ -11,23 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import time from collections import defaultdict import numpy as np import paddle +from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from paddle import distributed as dist + from parakeet.data import dataset +from parakeet.exps.tacotron2.config import get_cfg_defaults +from parakeet.exps.tacotron2.ljspeech import LJSpeech +from parakeet.exps.tacotron2.ljspeech import LJSpeechCollector +from parakeet.models.tacotron2 import Tacotron2 +from parakeet.models.tacotron2 import Tacotron2Loss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase -from parakeet.utils import display, mp_tools -from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss - -from config import get_cfg_defaults -from ljspeech import LJSpeech, LJSpeechCollector +from parakeet.utils import display +from parakeet.utils import mp_tools class Experiment(ExperimentBase): diff --git a/parakeet/exps/transformer_tts/__init__.py b/parakeet/exps/transformer_tts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/parakeet/exps/transformer_tts/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/ljspeech/tts1/local/transformer_tts/normalize.py b/parakeet/exps/transformer_tts/normalize.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/normalize.py rename to parakeet/exps/transformer_tts/normalize.py index a666ca2f295edff60af21a0134e3dfd391b2d97f..127449ee3ee90075b9e8e30ffcd4a017348b7b35 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/normalize.py +++ b/parakeet/exps/transformer_tts/normalize.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Normalize feature files and dump them.""" - import argparse import logging from operator import itemgetter @@ -20,10 +19,11 @@ from pathlib import Path import jsonlines import numpy as np -from parakeet.datasets.data_table import DataTable from sklearn.preprocessing import StandardScaler from tqdm import tqdm +from parakeet.datasets.data_table import DataTable + def main(): """Run preprocessing process.""" diff --git a/examples/ljspeech/tts1/local/transformer_tts/preprocess.py b/parakeet/exps/transformer_tts/preprocess.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/preprocess.py rename to parakeet/exps/transformer_tts/preprocess.py index 0f998bc306e5dd25d1fa4dea43cad44120d9dac0..96696eaedff181b3e6b3873b20fc764f2573c83f 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/preprocess.py +++ b/parakeet/exps/transformer_tts/preprocess.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. - import argparse from concurrent.futures import ThreadPoolExecutor from operator import itemgetter @@ -25,9 +24,10 @@ import librosa import numpy as np import tqdm import yaml +from yacs.config import CfgNode as Configuration + from parakeet.data.get_feats import LogMelFBank from parakeet.frontend import English -from yacs.config import CfgNode as Configuration def get_lj_sentences(file_name, frontend): diff --git a/examples/ljspeech/tts1/local/transformer_tts/synthesize.py b/parakeet/exps/transformer_tts/synthesize.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/synthesize.py rename to parakeet/exps/transformer_tts/synthesize.py index 21614c539d1427251786b5ba8c24cfc46cd5cf0a..5c1945d28d37b9d2e98de5991a75eb16f7b28212 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/synthesize.py +++ b/parakeet/exps/transformer_tts/synthesize.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -22,6 +21,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.datasets.data_table import DataTable from parakeet.models.transformer_tts import TransformerTTS from parakeet.models.transformer_tts import TransformerTTSInference diff --git a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py b/parakeet/exps/transformer_tts/synthesize_e2e.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py rename to parakeet/exps/transformer_tts/synthesize_e2e.py index 7ca75a8f496a68b51e214b509991edd5b4e9acb6..2bee77d35bfae18e3a02145f4f0c3d4c265e67f4 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/ljspeech/synthesize_e2e.py +++ b/parakeet/exps/transformer_tts/synthesize_e2e.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import argparse import logging from pathlib import Path @@ -21,6 +20,7 @@ import paddle import soundfile as sf import yaml from yacs.config import CfgNode + from parakeet.frontend import English from parakeet.models.transformer_tts import TransformerTTS from parakeet.models.transformer_tts import TransformerTTSInference diff --git a/examples/ljspeech/tts1/local/transformer_tts/train.py b/parakeet/exps/transformer_tts/train.py similarity index 100% rename from examples/ljspeech/tts1/local/transformer_tts/train.py rename to parakeet/exps/transformer_tts/train.py index b1263bcca3cde1a2c87c72f4439f09fa8db9efe4..fdaff347521e073af5f51c93d0810242021743c8 100644 --- a/examples/ljspeech/tts1/local/transformer_tts/train.py +++ b/parakeet/exps/transformer_tts/train.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- import argparse -import os import logging +import os import shutil from pathlib import Path @@ -26,18 +25,19 @@ from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from parakeet.datasets.data_table import DataTable +from visualdl import LogWriter +from yacs.config import CfgNode + from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn +from parakeet.datasets.data_table import DataTable from parakeet.models.transformer_tts import TransformerTTS -from parakeet.models.transformer_tts import TransformerTTSUpdater from parakeet.models.transformer_tts import TransformerTTSEvaluator +from parakeet.models.transformer_tts import TransformerTTSUpdater from parakeet.training.extensions.snapshot import Snapshot from parakeet.training.extensions.visualizer import VisualDL from parakeet.training.optimizer import build_optimizers from parakeet.training.seeding import seed_everything from parakeet.training.trainer import Trainer -from visualdl import LogWriter -from yacs.config import CfgNode def train_sp(args, config): diff --git a/parakeet/exps/voice_cloning/__init__.py b/parakeet/exps/voice_cloning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/parakeet/exps/voice_cloning/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/parakeet/exps/voice_cloning/tacotron2_ge2e/__init__.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
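Editor note: the `train.py` scripts touched in this patch all build a `DistributedBatchSampler` plus `DataLoader` before wiring the updater/trainer. A self-contained sketch of that setup, with a toy dataset standing in for the repo's real ones:

```python
# Self-contained sketch of the sampler/loader setup shared by the train.py
# scripts in this patch; ToyDataset is a stand-in, not part of the repo.
import paddle
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler


class ToyDataset(paddle.io.Dataset):
    def __len__(self):
        return 16

    def __getitem__(self, idx):
        return paddle.randn([80, 100])  # stand-in for a mel spectrogram


train_set = ToyDataset()
sampler = DistributedBatchSampler(
    train_set, batch_size=4, shuffle=True, drop_last=True)
train_loader = DataLoader(train_set, batch_sampler=sampler, num_workers=0)

for batch in train_loader:
    # default collation stacks the items into a [4, 80, 100] batch
    break
```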
diff --git a/examples/aishell3/vc0/local/tacotron2/aishell3.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/aishell3.py similarity index 92% rename from examples/aishell3/vc0/local/tacotron2/aishell3.py rename to parakeet/exps/voice_cloning/tacotron2_ge2e/aishell3.py index c53cf59dca3c8f8421636bb6101f988cb18f4b9f..b697ecf3602de5ada3f45709b0855154707c66d8 100644 --- a/examples/aishell3/vc0/local/tacotron2/aishell3.py +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/aishell3.py @@ -11,16 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import pickle from pathlib import Path import numpy as np from paddle.io import Dataset -from parakeet.frontend import Vocab -from parakeet.data import batch_text_id, batch_spec -from preprocess_transcription import _phones, _tones +from parakeet.data import batch_spec +from parakeet.data import batch_text_id +from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones +from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones +from parakeet.frontend import Vocab voc_phones = Vocab(sorted(list(_phones))) print("vocab_phones:\n", voc_phones) diff --git a/examples/aishell3/vc0/local/tacotron2/chinese_g2p.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py similarity index 85% rename from examples/aishell3/vc0/local/tacotron2/chinese_g2p.py rename to parakeet/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py index e2437f06f8e5aa72f99a418a7030f5c124d498d9..f975522f1bf429ee77be2708e70a1cef97de8b81 100644 --- a/examples/aishell3/vc0/local/tacotron2/chinese_g2p.py +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import List +from typing import Tuple -from typing import List, Tuple -from pypinyin import lazy_pinyin, Style -from preprocess_transcription import split_syllable +from pypinyin import lazy_pinyin +from pypinyin import Style + +from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable def convert_to_pinyin(text: str) -> List[str]: diff --git a/examples/aishell3/vc0/local/tacotron2/config.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/config.py similarity index 99% rename from examples/aishell3/vc0/local/tacotron2/config.py rename to parakeet/exps/voice_cloning/tacotron2_ge2e/config.py index 440bdbd94d7b7e2e1623be3cf70dc0afc6eedb32..8d8c9c4e1b70022328818b9f60e541f6286420ff 100644 --- a/examples/aishell3/vc0/local/tacotron2/config.py +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/config.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
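Editor note: `chinese_g2p.py` above keeps its pypinyin backend after the move. A sketch of the kind of call it makes (assuming TONE3-style output with the neutral tone written as 5, as the downstream `split_syllable` expects; output is indicative):

```python
# Sketch of the pypinyin call behind convert_to_pinyin; TONE3 appends the
# tone digit to each syllable, e.g. "mei3".
from pypinyin import lazy_pinyin
from pypinyin import Style

syllables = lazy_pinyin(
    "每当你觉得", style=Style.TONE3, neutral_tone_with_five=True)
print(syllables)  # indicative output: ['mei3', 'dang1', 'ni3', 'jue2', 'de5']
```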
-
 from yacs.config import CfgNode as CN

 _C = CN()
diff --git a/examples/aishell3/vc0/local/tacotron2/extract_mel.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
similarity index 92%
rename from examples/aishell3/vc0/local/tacotron2/extract_mel.py
rename to parakeet/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
index b7bafb86ce086783725b0ff96b00db462cd65df2..53daa3fcd9d58ba08150d460fce1d08497ecc421 100644
--- a/examples/aishell3/vc0/local/tacotron2/extract_mel.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
@@ -11,19 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import multiprocessing as mp
 from functools import partial
 from pathlib import Path

 import numpy as np
-from parakeet.audio import AudioProcessor
-from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
-
 import tqdm
-from config import get_cfg_defaults
+from parakeet.audio import AudioProcessor
+from parakeet.audio.spec_normalizer import LogMagnitude
+from parakeet.audio.spec_normalizer import NormalizerBase
+from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults


 def extract_mel(fname: Path,
@@ -47,7 +46,7 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
     output_dir.mkdir(parents=True, exist_ok=True)

     p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
                        config.hop_length, config.n_mels, config.fmin,
                        config.fmax)
     n = LogMagnitude(1e-5)
diff --git a/examples/aishell3/vc0/local/tacotron2/lexicon.txt b/parakeet/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
similarity index 100%
rename from examples/aishell3/vc0/local/tacotron2/lexicon.txt
rename to parakeet/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
diff --git a/examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
similarity index 99%
rename from examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py
rename to parakeet/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
index fa74331b32f3d404fd6993669db064de6c7b97ad..ce117d420eda7eb736df26962912455524974671 100644
--- a/examples/aishell3/vc0/local/tacotron2/preprocess_transcription.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
@@ -11,14 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
-from pathlib import Path
-import re
 import pickle
+import re
+from pathlib import Path

-import yaml
 import tqdm
+import yaml

 zh_pattern = re.compile("[\u4e00-\u9fa5]")
diff --git a/examples/aishell3/vc0/local/tacotron2/process_wav.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/process_wav.py
similarity index 99%
rename from examples/aishell3/vc0/local/tacotron2/process_wav.py
rename to parakeet/exps/voice_cloning/tacotron2_ge2e/process_wav.py
index 34d40897019b5f0da7e29616f833bd2fc6721e99..c1be0a37eb15045adc253b1e88692ee3f942dfc3 100644
--- a/examples/aishell3/vc0/local/tacotron2/process_wav.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/process_wav.py
@@ -11,17 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # limitations under the License. - import argparse -from pathlib import Path -from multiprocessing import Pool from functools import partial +from multiprocessing import Pool +from pathlib import Path -import numpy as np import librosa +import numpy as np import soundfile as sf -from tqdm import tqdm from praatio import tgio +from tqdm import tqdm def get_valid_part(fpath): diff --git a/examples/aishell3/vc0/local/tacotron2/train.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/train.py similarity index 93% rename from examples/aishell3/vc0/local/tacotron2/train.py rename to parakeet/exps/voice_cloning/tacotron2_ge2e/train.py index de018116859289b639283a820fb98bfa8d106b99..1a9bd8cb9887dada10bb25d883180a20addc1261 100644 --- a/examples/aishell3/vc0/local/tacotron2/train.py +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/train.py @@ -11,26 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import time -from pathlib import Path from collections import defaultdict +from pathlib import Path import numpy as np -from matplotlib import pyplot as plt - import paddle +from matplotlib import pyplot as plt from paddle import distributed as dist -from paddle.io import DataLoader, DistributedBatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from parakeet.data import dataset +from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3 +from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples +from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults +from parakeet.models.tacotron2 import Tacotron2 +from parakeet.models.tacotron2 import Tacotron2Loss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase -from parakeet.utils import display, mp_tools -from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss - -from config import get_cfg_defaults -from aishell3 import AiShell3, collate_aishell3_examples +from parakeet.utils import display +from parakeet.utils import mp_tools class Experiment(ExperimentBase): @@ -192,9 +193,9 @@ class Experiment(ExperimentBase): def setup_dataloader(self): args = self.args config = self.config - ljspeech_dataset = AiShell3(args.data) + aishell3_dataset = AiShell3(args.data) - valid_set, train_set = dataset.split(ljspeech_dataset, + valid_set, train_set = dataset.split(aishell3_dataset, config.data.valid_size) batch_fn = collate_aishell3_examples diff --git a/parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py b/parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py new file mode 100644 index 0000000000000000000000000000000000000000..8afd35b77ee619289a654e4c8807228e89b0b774 --- /dev/null +++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from pathlib import Path
+
+import numpy as np
+import paddle
+import soundfile as sf
+from matplotlib import pyplot as plt
+
+from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
+from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
+from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
+from parakeet.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
+from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
+from parakeet.models.tacotron2 import Tacotron2
+from parakeet.models.waveflow import ConditionalWaveFlow
+from parakeet.utils import display
+
+
+def voice_cloning(args):
+    # speaker encoder
+    p = SpeakerVerificationPreprocessor(
+        sampling_rate=16000,
+        audio_norm_target_dBFS=-30,
+        vad_window_length=30,
+        vad_moving_average_width=8,
+        vad_max_silence_length=6,
+        mel_window_length=25,
+        mel_window_step=10,
+        n_mels=40,
+        partial_n_frames=160,
+        min_pad_coverage=0.75,
+        partial_overlap_ratio=0.5)
+    print("Audio Processor Done!")
+
+    speaker_encoder = LSTMSpeakerEncoder(
+        n_mels=40, num_layers=3, hidden_size=256, output_size=256)
+    speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
+    speaker_encoder.eval()
+    print("GE2E Done!")
+
+    synthesizer = Tacotron2(
+        vocab_size=68,
+        n_tones=10,
+        d_mels=80,
+        d_encoder=512,
+        encoder_conv_layers=3,
+        encoder_kernel_size=5,
+        d_prenet=256,
+        d_attention_rnn=1024,
+        d_decoder_rnn=1024,
+        attention_filters=32,
+        attention_kernel_size=31,
+        d_attention=128,
+        d_postnet=512,
+        postnet_kernel_size=5,
+        postnet_conv_layers=5,
+        reduction_factor=1,
+        p_encoder_dropout=0.5,
+        p_prenet_dropout=0.5,
+        p_attention_dropout=0.1,
+        p_decoder_dropout=0.1,
+        p_postnet_dropout=0.5,
+        d_global_condition=256,
+        use_stop_token=False, )
+    synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
+    synthesizer.eval()
+    print("Tacotron2 Done!")
+
+    # vocoder
+    vocoder = ConditionalWaveFlow(
+        upsample_factors=[16, 16],
+        n_flows=8,
+        n_layers=8,
+        n_group=16,
+        channels=128,
+        n_mels=80,
+        kernel_size=[3, 3])
+    vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
+    vocoder.eval()
+    print("WaveFlow Done!")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    input_dir = Path(args.input_dir)
+
+    # The AISHELL-3 dataset uses % and $ to mark the boundaries of prosodic
+    # words and prosodic phrases; they roughly correspond to shorter and
+    # longer pauses, so % and $ can be used in the text to adjust prosody.
+    # Note that the valid character set of a sentence is Chinese characters
+    # plus % and $, so the input sentence may contain only these characters.
+    sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
+    phones, tones = convert_sentence(sentence)
+    phones = np.array(
+        [voc_phones.lookup(item) for item in phones], dtype=np.int64)
+    tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
+    phones = paddle.to_tensor(phones).unsqueeze(0)
+    tones = paddle.to_tensor(tones).unsqueeze(0)
+
+    for name in os.listdir(input_dir):
+        utt_id = name.split(".")[0]
+        ref_audio_path = input_dir / name
+        mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
+        print("mel_sequences: ", mel_sequences.shape)
+        with paddle.no_grad():
+            embed = speaker_encoder.embed_utterance(
+                paddle.to_tensor(mel_sequences))
+        print("embed shape: ", embed.shape)
+        utterance_embeds = paddle.unsqueeze(embed, 0)
+        outputs = synthesizer.infer(
+            phones, tones=tones, global_condition=utterance_embeds)
+        mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
+        alignment = outputs["alignments"][0].numpy().T
+        display.plot_alignment(alignment)
+        plt.savefig(str(output_dir / (utt_id + ".png")))
+
+        with paddle.no_grad():
+            wav = vocoder.infer(mel_input)
+        wav = wav.numpy()[0]
+        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)
+
+
+def main():
+    # parse args and run voice cloning
+    parser = argparse.ArgumentParser(
+        description="Voice cloning with GE2E, Tacotron2 and WaveFlow.")
+    parser.add_argument(
+        "--ge2e_params_path", type=str, help="ge2e params path.")
+    parser.add_argument(
+        "--tacotron2_params_path", type=str, help="tacotron2 params path.")
+    parser.add_argument(
+        "--waveflow_params_path", type=str, help="waveflow params path.")
+    parser.add_argument(
+        "--device", type=str, default="gpu", help="device type to use.")
+
+    parser.add_argument(
+        "--input-dir",
+        type=str,
+        help="input dir of *.wav, the sample rate will be resampled to 16k.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+
+    args = parser.parse_args()
+
+    paddle.set_device(args.device)
+
+    voice_cloning(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/parakeet/exps/waveflow/__init__.py b/parakeet/exps/waveflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54
--- /dev/null
+++ b/parakeet/exps/waveflow/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/ljspeech/voc0/local/waveflow/config.py b/parakeet/exps/waveflow/config.py
similarity index 99%
rename from examples/ljspeech/voc0/local/waveflow/config.py
rename to parakeet/exps/waveflow/config.py
index d009a2c82e0b9877891d116dc4f1a86983f90e89..869caa6a2c43e902f897011d637e80987b5f383a 100644
--- a/examples/ljspeech/voc0/local/waveflow/config.py
+++ b/parakeet/exps/waveflow/config.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from yacs.config import CfgNode as CN

 _C = CN()
diff --git a/examples/ljspeech/voc0/local/waveflow/ljspeech.py b/parakeet/exps/waveflow/ljspeech.py
similarity index 97%
rename from examples/ljspeech/voc0/local/waveflow/ljspeech.py
rename to parakeet/exps/waveflow/ljspeech.py
index afeba3915364771b0f1487f92d5be27963d3ebb0..ca18f400e22037c11cc7d070be082e32ba82fe5e 100644
--- a/examples/ljspeech/voc0/local/waveflow/ljspeech.py
+++ b/parakeet/exps/waveflow/ljspeech.py
@@ -11,14 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
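Editor note: now that `voice_cloning.py` lives inside the package, it can also be driven programmatically. A hypothetical invocation of the `voice_cloning()` entry point above; every path is a placeholder for your own checkpoints and reference audio:

```python
# Hypothetical driver for the voice_cloning() entry point above;
# all paths below are placeholders, not files shipped with this patch.
from argparse import Namespace

import paddle

from parakeet.exps.voice_cloning.tacotron2_ge2e.voice_cloning import voice_cloning

paddle.set_device("gpu")
args = Namespace(
    ge2e_params_path="checkpoints/ge2e.pdparams",
    tacotron2_params_path="checkpoints/tacotron2.pdparams",
    waveflow_params_path="checkpoints/waveflow.pdparams",
    input_dir="ref_wavs",      # reference *.wav files
    output_dir="cloned_wavs")  # synthesized wavs and alignment plots
voice_cloning(args)
```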
- from pathlib import Path import numpy as np import pandas from paddle.io import Dataset -from parakeet.data.batch import batch_spec, batch_wav +from parakeet.data.batch import batch_spec +from parakeet.data.batch import batch_wav class LJSpeech(Dataset): diff --git a/examples/ljspeech/voc0/local/waveflow/preprocess.py b/parakeet/exps/waveflow/preprocess.py similarity index 98% rename from examples/ljspeech/voc0/local/waveflow/preprocess.py rename to parakeet/exps/waveflow/preprocess.py index 199081c002f89d1b8d90295d76273ef74f374c13..d4ec0de5da711ce405dccdd438da2ec299de3863 100644 --- a/examples/ljspeech/voc0/local/waveflow/preprocess.py +++ b/parakeet/exps/waveflow/preprocess.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import os import argparse +import os from pathlib import Path -import tqdm -import numpy as np import librosa +import numpy as np import pandas as pd +import tqdm -from parakeet.datasets import LJSpeechMetaData from parakeet.audio import LogMagnitude - -from config import get_cfg_defaults +from parakeet.datasets import LJSpeechMetaData +from parakeet.exps.waveflow.config import get_cfg_defaults class Transform(object): diff --git a/examples/ljspeech/voc0/local/waveflow/synthesize.py b/parakeet/exps/waveflow/synthesize.py similarity index 97% rename from examples/ljspeech/voc0/local/waveflow/synthesize.py rename to parakeet/exps/waveflow/synthesize.py index e25cec3ee5e0a3e89138c697624875b3e8993902..4dd52514a08c2d89b356aa227d30ae4fb00c1086 100644 --- a/examples/ljspeech/voc0/local/waveflow/synthesize.py +++ b/parakeet/exps/waveflow/synthesize.py @@ -11,20 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -import os import argparse +import os from pathlib import Path import numpy as np -import soundfile as sf import paddle +import soundfile as sf +from parakeet.exps.waveflow.config import get_cfg_defaults from parakeet.models.waveflow import ConditionalWaveFlow from parakeet.utils import layer_tools -from config import get_cfg_defaults - def main(config, args): paddle.set_device(args.device) diff --git a/examples/ljspeech/voc0/local/waveflow/train.py b/parakeet/exps/waveflow/train.py similarity index 92% rename from examples/ljspeech/voc0/local/waveflow/train.py rename to parakeet/exps/waveflow/train.py index 359670facdd034b3a9f7953a10a39631b78de559..ecfcbcaac8c57979e11d20555ae7bdb52c952fcf 100644 --- a/examples/ljspeech/voc0/local/waveflow/train.py +++ b/parakeet/exps/waveflow/train.py @@ -11,22 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
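Editor note: the waveflow `synthesize.py` above drives `ConditionalWaveFlow` for mel-to-wav inference. A sketch of that step, reusing the constructor arguments shown in `voice_cloning.py` earlier in this patch (the checkpoint path and the random mel are placeholders):

```python
# Sketch of WaveFlow vocoder inference; constructor arguments mirror
# voice_cloning.py above, the checkpoint path is hypothetical.
import paddle

from parakeet.models.waveflow import ConditionalWaveFlow

vocoder = ConditionalWaveFlow(
    upsample_factors=[16, 16],
    n_flows=8,
    n_layers=8,
    n_group=16,
    channels=128,
    n_mels=80,
    kernel_size=[3, 3])
vocoder.set_state_dict(paddle.load("checkpoints/waveflow.pdparams"))
vocoder.eval()

mel = paddle.randn([1, 80, 200])  # stand-in for a real mel spectrogram
with paddle.no_grad():
    wav = vocoder.infer(mel)      # waveform tensor, one row per batch item
```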
- import time import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader, DistributedBatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from parakeet.data import dataset -from parakeet.models.waveflow import ConditionalWaveFlow, WaveFlowLoss -from parakeet.utils import mp_tools +from parakeet.exps.waveflow.config import get_cfg_defaults +from parakeet.exps.waveflow.ljspeech import LJSpeech +from parakeet.exps.waveflow.ljspeech import LJSpeechClipCollector +from parakeet.exps.waveflow.ljspeech import LJSpeechCollector +from parakeet.models.waveflow import ConditionalWaveFlow +from parakeet.models.waveflow import WaveFlowLoss from parakeet.training.cli import default_argument_parser from parakeet.training.experiment import ExperimentBase - -from config import get_cfg_defaults -from ljspeech import LJSpeech, LJSpeechClipCollector, LJSpeechCollector +from parakeet.utils import mp_tools class Experiment(ExperimentBase): diff --git a/parakeet/frontend/__init__.py b/parakeet/frontend/__init__.py index b8779b65b1c1d04b68fbc98b5465c485a4ad2117..64015435eefd7a8f1d3369a49cb0be7e10c8ec60 100644 --- a/parakeet/frontend/__init__.py +++ b/parakeet/frontend/__init__.py @@ -11,11 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .zh_normalization import * from .generate_lexicon import * from .normalizer import * from .phonectic import * from .punctuation import * from .tone_sandhi import * from .vocab import * +from .zh_normalization import * diff --git a/parakeet/frontend/normalizer/__init__.py b/parakeet/frontend/normalizer/__init__.py index 37fd5806d27c3b7523bbf146243de564b20ec02e..d1f2bfc537106f0a50e20864e5875be35fbd964c 100644 --- a/parakeet/frontend/normalizer/__init__.py +++ b/parakeet/frontend/normalizer/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from parakeet.frontend.normalizer.normalizer import * from parakeet.frontend.normalizer.numbers import * diff --git a/parakeet/frontend/phonectic.py b/parakeet/frontend/phonectic.py index 23662254c15941e7c183e9805c2b396dbaf689b3..874c19795d70e4681db3f38944273d5d7859b3c2 100644 --- a/parakeet/frontend/phonectic.py +++ b/parakeet/frontend/phonectic.py @@ -17,9 +17,9 @@ from abc import abstractmethod from g2p_en import G2p from g2pM import G2pM -from parakeet.frontend.vocab import Vocab from parakeet.frontend.normalizer.normalizer import normalize from parakeet.frontend.punctuation import get_punctuations +from parakeet.frontend.vocab import Vocab # discard opencc untill we find an easy solution to install it on windows # from opencc import OpenCC diff --git a/parakeet/frontend/zh_frontend.py b/parakeet/frontend/zh_frontend.py index 8a0c1668ca5531c09bf67c34fb7934113fe1fd87..04ce235f7b3400a1c1c60c5786316d1958b6c91a 100644 --- a/parakeet/frontend/zh_frontend.py +++ b/parakeet/frontend/zh_frontend.py @@ -22,9 +22,9 @@ from g2pM import G2pM from pypinyin import lazy_pinyin from pypinyin import Style -from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer from parakeet.frontend.generate_lexicon import generate_lexicon from parakeet.frontend.tone_sandhi import ToneSandhi +from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer class Frontend(): diff --git a/parakeet/frontend/zh_normalization/__init__.py b/parakeet/frontend/zh_normalization/__init__.py index 77e10ebb47fb4cfa98f417a5af4a61f07c3950ef..1e49408043e59814353c39b952ca422926f37c92 100644 --- a/parakeet/frontend/zh_normalization/__init__.py +++ b/parakeet/frontend/zh_normalization/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from parakeet.frontend.zh_normalization.text_normlization import * diff --git a/parakeet/models/__init__.py b/parakeet/models/__init__.py index e943def76f843f4d0bb1ce9f4e2f2f98a2a50e26..4ce90896d2e5d9421eb2cc922bd8dca24d48819c 100644 --- a/parakeet/models/__init__.py +++ b/parakeet/models/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .fastspeech2 import * from .tacotron2 import * from .transformer_tts import * diff --git a/parakeet/models/fastspeech2/__init__.py b/parakeet/models/fastspeech2/__init__.py index 83479d6f6ccf6b02a87e23f684c1b63d4ff342c6..52925ef8ce732b87999097fff469b19a7dd8f719 100644 --- a/parakeet/models/fastspeech2/__init__.py +++ b/parakeet/models/fastspeech2/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
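Editor note: `phonectic.py` above wraps two third-party G2P backends, `g2p_en` for English and `g2pM` for Mandarin. A sketch of both calls as I understand their APIs (outputs indicative, call signatures from the upstream READMEs):

```python
# Sketch of the two G2P backends phonectic.py builds on.
from g2p_en import G2p  # English grapheme-to-phoneme
from g2pM import G2pM   # Mandarin grapheme-to-pinyin

g2p = G2p()
print(g2p("Hello world"))      # indicative: ['HH', 'AH0', 'L', 'OW1', ...]

g2pm = G2pM()
print(g2pm("你好", tone=True))  # indicative: ['ni3', 'hao3']
```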
-
 from .fastspeech2 import *
 from .fastspeech2_updater import *
diff --git a/parakeet/models/fastspeech2/fastspeech2.py b/parakeet/models/fastspeech2/fastspeech2.py
index 019979b95091e2594042788f32e9d7f9fca53a05..7c0e20bc2d67b51a5c31fbde45834483e00956f5 100644
--- a/parakeet/models/fastspeech2/fastspeech2.py
+++ b/parakeet/models/fastspeech2/fastspeech2.py
@@ -28,10 +28,10 @@ from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePr
 from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
 from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
 from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
-from parakeet.modules.tacotron2.decoder import Postnet
 from parakeet.modules.nets_utils import initialize
 from parakeet.modules.nets_utils import make_non_pad_mask
 from parakeet.modules.nets_utils import make_pad_mask
+from parakeet.modules.tacotron2.decoder import Postnet


 class FastSpeech2(nn.Layer):
diff --git a/parakeet/models/fastspeech2/fastspeech2_updater.py b/parakeet/models/fastspeech2/fastspeech2_updater.py
index 789965f4d9b1ce7ceda790968e44ad1002c19cbe..ea23ec2af5f78e295c6f0a56abd8331751bb9295 100644
--- a/parakeet/models/fastspeech2/fastspeech2_updater.py
+++ b/parakeet/models/fastspeech2/fastspeech2_updater.py
@@ -14,6 +14,7 @@
 import logging

 from paddle import distributed as dist
+
 from parakeet.models.fastspeech2 import FastSpeech2Loss
 from parakeet.training.extensions.evaluator import StandardEvaluator
 from parakeet.training.reporter import report
diff --git a/parakeet/models/lstm_speaker_encoder.py b/parakeet/models/lstm_speaker_encoder.py
index 3372b21296df1dd0f15a18c923c0a7e1eec347a1..f92fddc0e85c84e3112306d5298e4f76e703471f 100644
--- a/parakeet/models/lstm_speaker_encoder.py
+++ b/parakeet/models/lstm_speaker_encoder.py
@@ -106,10 +106,10 @@ class LSTMSpeakerEncoder(nn.Layer):
     def do_gradient_ops(self):
         for p in [self.similarity_weight, self.similarity_bias]:
             g = p._grad_ivar()
             g[...] = g * 0.01

     def inv_argmax(self, i, num):
-        return np.eye(1, num, i, dtype=np.int)[0]
+        return np.eye(1, num, i, dtype=int)[0]

     def loss(self, embeds):
         """
diff --git a/parakeet/models/parallel_wavegan/__init__.py b/parakeet/models/parallel_wavegan/__init__.py
index 89403c0e0d4687ea8f3d3832f43b375a6bb9ba61..72322735b7c5719a12a60efaace31edde5d39aaa 100644
--- a/parakeet/models/parallel_wavegan/__init__.py
+++ b/parakeet/models/parallel_wavegan/__init__.py
@@ -11,6 +11,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from .parallel_wavegan import *
 from .parallel_wavegan_updater import *
diff --git a/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py
index 68328fb3c9a5fbe387fe0b5ce502d830b92bbcb2..7bd59881d97ea813cbe2044152f9ede509c96e71 100644
--- a/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py
+++ b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
- import logging from typing import Dict @@ -21,11 +20,12 @@ from paddle.io import DataLoader from paddle.nn import Layer from paddle.optimizer import Optimizer from paddle.optimizer.lr import LRScheduler +from timer import timer + from parakeet.training.extensions.evaluator import StandardEvaluator from parakeet.training.reporter import report from parakeet.training.updaters.standard_updater import StandardUpdater from parakeet.training.updaters.standard_updater import UpdaterState -from timer import timer logging.basicConfig( format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]') diff --git a/parakeet/models/speedyspeech/__init__.py b/parakeet/models/speedyspeech/__init__.py index 6d9b70887860c1435f5900fbb8d87fe904a0c7b7..abdac8da4dfd1c55b9ed4038e17602023bc3bbc5 100644 --- a/parakeet/models/speedyspeech/__init__.py +++ b/parakeet/models/speedyspeech/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .speedyspeech import * from .speedyspeech_updater import * diff --git a/parakeet/models/speedyspeech/speedyspeech_updater.py b/parakeet/models/speedyspeech/speedyspeech_updater.py index 3135d342646dbb97b83417f585b34c5c763379db..a17c93c7924ca42784b6dabeebeb908bc0f33030 100644 --- a/parakeet/models/speedyspeech/speedyspeech_updater.py +++ b/parakeet/models/speedyspeech/speedyspeech_updater.py @@ -17,6 +17,7 @@ import paddle from paddle import distributed as dist from paddle.fluid.layers import huber_loss from paddle.nn import functional as F + from parakeet.modules.losses import masked_l1_loss from parakeet.modules.losses import weighted_mean from parakeet.modules.ssim import ssim diff --git a/parakeet/models/transformer_tts/__init__.py b/parakeet/models/transformer_tts/__init__.py index 0456c3006a73f58122b32f062c9186975833b3a5..80a151ecaf28e7bfb54bb3ec62bf2f6c6480514b 100644 --- a/parakeet/models/transformer_tts/__init__.py +++ b/parakeet/models/transformer_tts/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from .transformer_tts import * from .transformer_tts_updater import * diff --git a/parakeet/models/transformer_tts/transformer_tts.py b/parakeet/models/transformer_tts/transformer_tts.py index 42ab5f8673e00c5aa7d046eb5605448a96ac4a03..bb3674f383d70ee7bd6539196a8932c5715884ac 100644 --- a/parakeet/models/transformer_tts/transformer_tts.py +++ b/parakeet/models/transformer_tts/transformer_tts.py @@ -15,10 +15,11 @@ from typing import Dict from typing import Sequence from typing import Tuple + import numpy import paddle -from paddle import nn import paddle.nn.functional as F +from paddle import nn from typeguard import check_argument_types from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention @@ -27,13 +28,13 @@ from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncodin from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding from parakeet.modules.fastspeech2_transformer.encoder import Encoder from parakeet.modules.fastspeech2_transformer.mask import subsequent_mask +from parakeet.modules.nets_utils import initialize +from parakeet.modules.nets_utils import make_non_pad_mask +from parakeet.modules.nets_utils import make_pad_mask from parakeet.modules.style_encoder import StyleEncoder from parakeet.modules.tacotron2.decoder import Postnet from parakeet.modules.tacotron2.decoder import Prenet as DecoderPrenet from parakeet.modules.tacotron2.encoder import Encoder as EncoderPrenet -from parakeet.modules.nets_utils import initialize -from parakeet.modules.nets_utils import make_non_pad_mask -from parakeet.modules.nets_utils import make_pad_mask class TransformerTTS(nn.Layer): diff --git a/parakeet/models/transformer_tts/transformer_tts_updater.py b/parakeet/models/transformer_tts/transformer_tts_updater.py index 7e75a860132c9d101941916e84c9f8bc5324500b..4bec475859316bb580868a031489a82551d9a546 100644 --- a/parakeet/models/transformer_tts/transformer_tts_updater.py +++ b/parakeet/models/transformer_tts/transformer_tts_updater.py @@ -16,6 +16,7 @@ from typing import Sequence import paddle from paddle import distributed as dist + from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss from parakeet.models.transformer_tts import TransformerTTSLoss from parakeet.training.extensions.evaluator import StandardEvaluator diff --git a/parakeet/modules/__init__.py b/parakeet/modules/__init__.py index fd38c0c40b71fc0ac33a7b45050d8783fb7d2692..664267895491c47ad0b3ecaaaae9412f3ce5110f 100644 --- a/parakeet/modules/__init__.py +++ b/parakeet/modules/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .attention import * from .conv import * from .geometry import * diff --git a/parakeet/modules/fastspeech2_transformer/decoder.py b/parakeet/modules/fastspeech2_transformer/decoder.py index a41a87c8c85f284269bcda71dc47f1ddca21ec64..0f09014f628669ee5f74b33116cb25af9daebee5 100644 --- a/parakeet/modules/fastspeech2_transformer/decoder.py +++ b/parakeet/modules/fastspeech2_transformer/decoder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
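Editor note: `transformer_tts.py` above imports `subsequent_mask` for its autoregressive decoder. The same causal mask written directly in plain paddle, for reference:

```python
# The causal (subsequent) mask idea in plain paddle: position i may only
# attend to positions <= i, i.e. a lower-triangular boolean matrix.
import paddle

n = 5
mask = paddle.tril(paddle.ones([n, n], dtype="int32")).astype("bool")
print(mask.astype("int32"))
# indicative output: a 5x5 lower-triangular matrix of ones
```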
- # 暂时删除了 dyminic conv """Decoder definition.""" import logging diff --git a/parakeet/modules/fastspeech2_transformer/decoder_layer.py b/parakeet/modules/fastspeech2_transformer/decoder_layer.py index 53328866e01f19ef8ac99e5a684f7fcaa44e84e9..f968051e629ae818aea03411c9295463341d68f3 100644 --- a/parakeet/modules/fastspeech2_transformer/decoder_layer.py +++ b/parakeet/modules/fastspeech2_transformer/decoder_layer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Decoder self-attention layer definition.""" - import paddle from paddle import nn + from parakeet.modules.layer_norm import LayerNorm diff --git a/parakeet/modules/fastspeech2_transformer/lightconv.py b/parakeet/modules/fastspeech2_transformer/lightconv.py index e5f59df18778c1b6587a7010093d13b67422ce70..061168848e75e1cd3a0255457d2292cd4a2f7b0a 100644 --- a/parakeet/modules/fastspeech2_transformer/lightconv.py +++ b/parakeet/modules/fastspeech2_transformer/lightconv.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Lightweight Convolution Module.""" - import numpy import paddle -from paddle import nn import paddle.nn.functional as F +from paddle import nn from parakeet.modules.glu import GLU from parakeet.modules.masked_fill import masked_fill diff --git a/parakeet/modules/fastspeech2_transformer/mask.py b/parakeet/modules/fastspeech2_transformer/mask.py index 7dbd4d2fa5b8d795a8279a4f6ec17e037de7f73b..fd97b0049a25e309bb0edfcbbd98b40848701afa 100644 --- a/parakeet/modules/fastspeech2_transformer/mask.py +++ b/parakeet/modules/fastspeech2_transformer/mask.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Mask module.""" - import paddle diff --git a/parakeet/modules/style_encoder.py b/parakeet/modules/style_encoder.py index aa94d4ba0e7c82bcd9f301f5c9ac1fdf0cb92b7f..fb27258ceeb566c966efca2892ffd2456060e854 100644 --- a/parakeet/modules/style_encoder.py +++ b/parakeet/modules/style_encoder.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """Style encoder of GST-Tacotron.""" - -from typeguard import check_argument_types from typing import Sequence import paddle from paddle import nn +from typeguard import check_argument_types + from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention diff --git a/parakeet/modules/tacotron2/decoder.py b/parakeet/modules/tacotron2/decoder.py index bf9d7e36444a46a0e89aaf753f19967a9eaef139..779fd0c62f87eb68c032abd5e19b88691f21054a 100644 --- a/parakeet/modules/tacotron2/decoder.py +++ b/parakeet/modules/tacotron2/decoder.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tacotron2 decoder related modules.""" - -import six import paddle.nn.functional as F +import six from paddle import nn diff --git a/parakeet/modules/tacotron2/encoder.py b/parakeet/modules/tacotron2/encoder.py index 1e22b769a7b22a4bf2bdf8a62fd7ce38603c4ad9..95f71d5e3f4a13da6ff4babd1962ddda6fea643d 100644 --- a/parakeet/modules/tacotron2/encoder.py +++ b/parakeet/modules/tacotron2/encoder.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
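Editor note: `lightconv.py` above relies on a `masked_fill` helper. A plain-paddle sketch of the masked-fill idea (not the repo helper itself), as typically used to block attention scores before a softmax:

```python
# Plain-paddle sketch of masked fill: write a large negative value where
# mask is True, leaving the other positions untouched.
import paddle

scores = paddle.to_tensor([[0.5, 1.2, -0.3]])
mask = paddle.to_tensor([[False, True, False]])
filled = paddle.where(mask, paddle.full_like(scores, -1e9), scores)
print(filled)  # [[0.5, -1e9, -0.3]]
```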
"""Tacotron2 encoder related modules.""" - -import six - import paddle +import six from paddle import nn diff --git a/parakeet/modules/transformer.py b/parakeet/modules/transformer.py index 696b12b6ea7f1a1f120c7cc536932561b1f31238..490458befc531a192f714c4ebd265ba70e4d43d5 100644 --- a/parakeet/modules/transformer.py +++ b/parakeet/modules/transformer.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. from paddle import nn -from parakeet.modules import attention as attn from paddle.nn import functional as F +from parakeet.modules import attention as attn + __all__ = [ "PositionwiseFFN", "TransformerEncoderLayer", diff --git a/parakeet/training/__init__.py b/parakeet/training/__init__.py index 277171dee194ade51f32b32ba7c9c20fdef8e25d..719e8445db528373bf3999e81e54b00ac41a1935 100644 --- a/parakeet/training/__init__.py +++ b/parakeet/training/__init__.py @@ -11,6 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from .cli import * from .experiment import * diff --git a/parakeet/training/optimizer.py b/parakeet/training/optimizer.py index 1f5496c09cdc2844c0fb7600eb9d8b185fc07fad..c6a6944d10758e2c1d0e6fae79ae1db995b0ac32 100644 --- a/parakeet/training/optimizer.py +++ b/parakeet/training/optimizer.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle optim_classes = dict( diff --git a/parakeet/utils/__init__.py b/parakeet/utils/__init__.py index 9811f201f334a579f6088da3e68f485408b42f44..ce3a4ef60ce05e42600cf9f0c5e3ba0fb8d44f5e 100644 --- a/parakeet/utils/__init__.py +++ b/parakeet/utils/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from . import checkpoint from . import display from . import layer_tools diff --git a/parakeet/utils/profiler.py b/parakeet/utils/profiler.py index e64afd6a0d8cfd860920916acfbf168d58dfff2d..2bbeb02d19f2c865f43477433cd7870a22bd3779 100644 --- a/parakeet/utils/profiler.py +++ b/parakeet/utils/profiler.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import sys + import paddle # A global variable to record the number of calling times for profiler diff --git a/setup.py b/setup.py index bd982129c83b13715599a3ede2d4ebc6759269c2..be17e0a4f212ef2bc0d1bad0b67c995879edbeb1 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,6 @@ import contextlib import inspect import io import os -import re import subprocess as sp import sys from pathlib import Path @@ -84,7 +83,7 @@ def _post_install(install_lib_dir): tools_extrs_dir = HERE / 'tools/extras' with pushd(tools_extrs_dir): print(os.getcwd()) - check_call(f"./install_autolog.sh") + check_call("./install_autolog.sh") print("autolog install.") # ctcdecoder diff --git a/tests/benchmark/pwgan/README.md b/tests/benchmark/pwgan/README.md index 3d2267aebcbcdaeb6c9b66767d19af6b05b777e0..369f4b74f6dc54bb6386cbd4a918f37f5fc0ee92 100644 --- a/tests/benchmark/pwgan/README.md +++ b/tests/benchmark/pwgan/README.md @@ -4,8 +4,8 @@ ``` 即可运行. 执行逻辑: -1. 
cd 到 ../../../ (也就是 Parakeet 目录) +1. cd 到 ../../../ (也就是 Deepspeech 目录) 2. 安装 parakeet 所需依赖 3. 从 bos 下载数据集并解压缩 -4. 预处理数据集为训练 pwg 所需格式,保存到 Parakeet/dump 文件夹底下 +4. 预处理数据集为训练 pwg 所需格式,保存到 Deepspeech/dump 文件夹底下 5. 按照不同的参数执行 run_benchmark.sh 脚本 diff --git a/tests/benchmark/pwgan/run_all.sh b/tests/benchmark/pwgan/run_all.sh index e26db31786c74cce4481c8b6757ca21dc309091c..e4bb17f8cd1bdf8a571d8aec5af0d9d6e61fdaf8 100755 --- a/tests/benchmark/pwgan/run_all.sh +++ b/tests/benchmark/pwgan/run_all.sh @@ -10,6 +10,9 @@ cd ../../../ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then sudo apt-get install libsndfile1 pip install -e . + pushd examples/csmsc/voc1 + source path.sh + popd fi # 2 拷贝该模型需要数据、预训练模型 # 下载 baker 数据集到 home 目录下并解压缩到 home 目录下 @@ -22,15 +25,14 @@ fi # 数据预处理 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python examples/GANVocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml - python utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" - python examples/GANVocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy - python examples/GANVocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy - python examples/GANVocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy + python3 parakeet/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/csmsc/voc1/conf/default.yaml + python3 utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" + python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy + python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy + python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy fi # 3 批量运行(如不方便批量,1,2需放到单个模型中) if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - model_mode_list=(pwg) fp_item_list=(fp32) # 满 bs 是 26 @@ -40,7 +42,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then for bs_item in ${bs_item_list[@]}; do echo "index is speed, 1gpus, begin, ${model_name}" run_mode=sp - CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/PWGAN/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min) + CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min) sleep 60 echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}" run_mode=mp diff --git a/tests/benchmark/pwgan/run_benchmark.sh b/tests/benchmark/pwgan/run_benchmark.sh index bcdccccf43e72cad8aa9a74f3e8f5bff687c329f..e961e442b49c4e86b493c508a40b25fe91a14d69 100755 --- a/tests/benchmark/pwgan/run_benchmark.sh +++ b/tests/benchmark/pwgan/run_benchmark.sh @@ -24,13 +24,13 @@ function _train(){ --max-iter=${max_iter} --train-metadata=dump/train/norm/metadata.jsonl \ --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml \ + --config=examples/csmsc/voc1/conf/default.yaml \ --output-dir=exp/default \ 
--run-benchmark=true" case ${run_mode} in - sp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;; - mp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}" + sp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;; + mp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}" log_parse_file="mylog/workerlog.0" ;; *) echo "choose run_mode(sp or mp)"; exit 1; esac diff --git a/tests/chains/speedyspeech/lite_train_infer.sh b/tests/chains/speedyspeech/lite_train_infer.sh old mode 100644 new mode 100755 diff --git a/tests/chains/speedyspeech/prepare.sh b/tests/chains/speedyspeech/prepare.sh old mode 100644 new mode 100755 diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt index 98026241820c6b8f933c64bf72cc2174c256958c..ad3420521d7c5e8163f634d3a36c5b2dfdb2f5fa 100644 --- a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt +++ b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt @@ -13,7 +13,7 @@ null:null null:null ## trainer:norm_train -norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True +norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True null:null null:null null:null @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt +eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" 
--phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
@@ -37,7 +37,7 @@ null:null
 null:null
 null:null
 null:null
-inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
+inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
 null:null
 null:null
 null:null
diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
index e821183aa80b8397e1c0882f1e5af5f1632fd52a..eaad714debd4c47588711f8b4eaabb053e1772b1 100644
--- a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
+norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
 null:null
 null:null
 null:null
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
+eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
@@ -37,7 +37,7 @@ null:null
 null:null
 null:null
 null:null
-inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
+inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
 --use_gpu:True
 null:null
 null:null
diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
index 7c5171197d98c08ce7f95fd976907f7d141d0826..236805fc5fcd677f50144a13848ca840f61cc6de 100644
--- a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
+norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
 null:null
 null:null
 null:null
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
@@ -37,7 +37,7 @@ null:null
 null:null
 null:null
 null:null
-inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
+inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
 null:null
 null:null
 null:null
diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt
index 9a6c611e12d0e196ac06665e46adddfde576ef22..9caeb18e7cdc86d766e3d0489d5407b424c127f7 100644
--- a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt
@@ -13,7 +13,7 @@ null:null
 null:null
 ##
 trainer:norm_train
-norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
+norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
 null:null
 null:null
 null:null
@@ -21,7 +21,7 @@ null:null
 null:null
 ##
 ===========================eval_params===========================
-eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
 null:null
 ##
 ===========================infer_params===========================
@@ -37,7 +37,7 @@ null:null
 null:null
 null:null
 null:null
-inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
+inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
 null:null
 null:null
 null:null
diff --git a/tests/chains/speedyspeech/test.sh b/tests/chains/speedyspeech/test.sh
old mode 100644
new mode 100755
diff --git a/tests/chains/speedyspeech/whole_train_infer.sh b/tests/chains/speedyspeech/whole_train_infer.sh
old mode 100644
new mode 100755
diff --git a/utils/json2trn.py b/utils/json2trn.py
index 873fde4f7c1fed6e100aaaf2d79030c4e0a441ad..4adfa491db7c5965f25f89b902ebedc3f4a193e2 100755
--- a/utils/json2trn.py
+++ b/utils/json2trn.py
@@ -4,7 +4,6 @@
 # 2018 Xuankai Chang (Shanghai Jiao Tong University)
 # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
 import argparse
-import json
 import logging
 import sys