未验证 提交 e3954624 编写于 作者: H Hui Zhang 提交者: GitHub

Merge pull request #932 from yt605155624/merge_parakeet

[tts] refactor parakeet example
......@@ -355,7 +355,6 @@ if not hasattr(paddle.Tensor, 'tolist'):
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn #############
from paddle.nn import Layer
from typing import Optional
......@@ -506,5 +505,3 @@ if not hasattr(paddle.nn, 'LayerDict'):
logger.debug(
"register user LayerDict to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'LayerDict', LayerDict)
......@@ -12,12 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
import json
from pathlib import Path
import jsonlines
import paddle
import yaml
from yacs.config import CfgNode
from .beam_search import BatchBeamSearch
......@@ -79,8 +75,7 @@ def recog_v2(args):
sort_in_input_length=False,
preprocess_conf=confs.collator.augmentation_config
if args.preprocess_conf is None else args.preprocess_conf,
preprocess_args={"train": False},
)
preprocess_args={"train": False}, )
if args.rnnlm:
lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
......@@ -113,8 +108,7 @@ def recog_v2(args):
ctc=args.ctc_weight,
lm=args.lm_weight,
ngram=args.ngram_weight,
length_bonus=args.penalty,
)
length_bonus=args.penalty, )
beam_search = BeamSearch(
beam_size=args.beam_size,
vocab_size=len(char_list),
......@@ -123,8 +117,7 @@ def recog_v2(args):
sos=model.sos,
eos=model.eos,
token_list=char_list,
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
)
pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
# TODO(karita): make all scorers batchfied
if args.batchsize == 1:
......@@ -171,7 +164,8 @@ def recog_v2(args):
logger.info(f'feat: {feat.shape}')
enc = model.encode(paddle.to_tensor(feat).to(dtype))
logger.info(f'eout: {enc.shape}')
nbest_hyps = beam_search(x=enc,
nbest_hyps = beam_search(
x=enc,
maxlenratio=args.maxlenratio,
minlenratio=args.minlenratio)
nbest_hyps = [
......@@ -183,9 +177,8 @@ def recog_v2(args):
item = new_js[name]['output'][0] # 1-best
ref = item['text']
rec_text = item['rec_text'].replace('▁',
' ').replace('<eos>',
'').strip()
rec_text = item['rec_text'].replace('▁', ' ').replace(
'<eos>', '').strip()
rec_tokenid = list(map(int, item['rec_tokenid'].split()))
f.write({
"utt": name,
......
......@@ -21,7 +21,7 @@ from distutils.util import strtobool
import configargparse
import numpy as np
from .recog import recog_v2
from deepspeech.decoders.recog import recog_v2
def get_parser():
......@@ -359,7 +359,7 @@ def main(args):
if args.num_encs == 1:
# Experimental API that supports custom LMs
if args.api == "v2":
from deepspeech.decoders.recog import recog_v2
recog_v2(args)
else:
raise ValueError("Only support --api v2")
......
......@@ -11,9 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import paddle
from paddle import nn
from typing import Union
from paddle.nn import functional as F
from typeguard import check_argument_types
......
# Aishell3
* tts0 - fastspeech2
* vc0 - tactron2 voice clone
* tts0 - Tactron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tactron2 Voice Clone with GE2E
......@@ -18,12 +18,23 @@ tar zxvf data_aishell3.tgz -C data_aishell3
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
......@@ -47,10 +58,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -85,20 +96,8 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -111,9 +110,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -153,19 +152,19 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
usage: multi_spk_synthesize_e2e.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--verbose VERBOSE]
......@@ -204,24 +203,38 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./aishell3_alignment_tone \
--output durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=aishell3 \
--rootdir=~/datasets/data_aishell3/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_96400.pdz \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_482.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Tacotron2 + AISHELL-3 Voice Cloning
This example contains code used to train a [Tacotron2 ](https://arxiv.org/abs/1712.05884) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). The trained model can be used in Voice Cloning Task, We refer to the model structure of [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) . The general steps are as follows:
1. Speaker Encoder: We use a Speaker Verification to train a speaker encoder. Datasets used in this task are different from those used in Tacotron2, because the transcriptions are not needed, we use more datasets, refer to [ge2e](../../other/ge2e).
2. Synthesizer: Then, we use the trained speaker encoder to generate utterance embedding for each sentence in AISHELL-3. This embedding is a extra input of Tacotron2 which will be concated with encoder outputs.
3. Vocoder: We use WaveFlow as the neural Vocoder, refer to [waveflow](../../ljspeech/voc0).
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./alignment`.
Assume the path to the pretrained ge2e model is `ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000`
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. start a voice cloning inference.
```bash
./run.sh
```
### Preprocess the dataset
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path}
```
#### generate utterance embedding
Use pretrained GE2E (speaker encoder) to generate utterance embedding for each sentence in AISHELL-3, which has the same file structure with wav files and the format is `.npy`.
```bash
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--device="gpu" \
--checkpoint_path=${ge2e_ckpt_path}
fi
```
The computing time of utterance embedding can be x hours.
#### process wav
There are silence in the edge of AISHELL-3's wavs, and the audio amplitude is very small, so, we need to remove the silence and normalize the audio. You can the silence remove method based on volume or energy, but the effect is not very good, We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get the alignment of text and speech, then utilize the alignment results to remove the silence.
We use Montreal Force Aligner 1.0. The label in aishell3 include pinyin,so the lexicon we provided to MFA is pinyin rather than Chinese characters. And the prosody marks(`$` and `%`) need to be removed. You shoud preprocess the dataset into the format which MFA needs, the texts have the same name with wavs and have the suffix `.lab`.
We use [lexicon.txt](./lexicon.txt) as the lexicon.
You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Process wav ..."
python3 ${BIN_DIR}/process_wav.py \
--input=${input}/wav \
--output=${preprocess_path}/normalized_wav \
--alignment=${alignment}
fi
```
#### preprocess transcription
We revert the transcription into `phones` and `tones`. It is worth noting that our processing here is different from that used for MFA, we separated the tones. This is a processing method, of course, you can only segment initials and vowels.
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/preprocess_transcription.py \
--input=${input} \
--output=${preprocess_path}
fi
```
The default input is `~/datasets/data_aishell3/train`,which contains `label_train-set.txt`, the processed results are `metadata.yaml` and `metadata.pickle`. the former is a text format for easy viewing, and the latter is a binary format for direct reading.
#### extract mel
```python
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
--input=${preprocess_path}/normalized_wav \
--output=${preprocess_path}/mel
fi
```
### Train the model
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
Our model remve stop token prediction in Tacotron2, because of the problem of extremely unbalanced proportion of positive and negative samples of stop token prediction, and it's very sensitive to the clip of audio silence. We use the last symbol from the highest point of attention to the encoder side as the termination condition.
In addition, in order to accelerate the convergence of the model, we add `guided attention loss` to induce the alignment between encoder and decoder to show diagonal lines faster.
### Infernece
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output}
```
## Pretrained Model
[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
#!/bin/bash
stage=0
stop_stage=100
input=$1
preprocess_path=$2
alignment=$3
ge2e_ckpt_path=$4
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../../ge2e/inference.py \
--input=${input} \
--output=${preprocess_path}/embed \
--device="gpu" \
--checkpoint_path=${ge2e_ckpt_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Process wav ..."
python3 ${BIN_DIR}/process_wav.py \
--input=${input}/wav \
--output=${preprocess_path}/normalized_wav \
--alignment=${alignment}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python3 ${BIN_DIR}/preprocess_transcription.py \
--input=${input} \
--output=${preprocess_path}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
python3 ${BIN_DIR}/extract_mel.py \
--input=${preprocess_path}/normalized_wav \
--output=${preprocess_path}/mel
fi
## Tacotron2 + AISHELL-3 数据集训练语音克隆模型
本实验的内容是利用 AISHELL-3 数据集和 Tacotron 2 模型进行语音克隆任务,使用的模型大体结构和论文 [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) 相同。大致步骤如下:
1. Speaker Encoder: 我们使用了一个 Speaker Verification 任务训练一个 speaker encoder。这部分任务所用的数据集和训练 Tacotron 2 的数据集不同,因为不需要 transcription 的缘故,我们使用了较多的训练数据,可以参考实现 [ge2e](../ge2e)
2. Synthesizer: 然后使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 这个 Embedding 作为 Tacotron 模型中的一个额外输入和 encoder outputs 拼接在一起。
3. Vocoder: 我们使用的声码器是 WaveFlow,参考实验 [waveflow](../waveflow).
## 数据处理
### utterance embedding 的生成
使用训练好的 speaker encoder 为 AISHELL-3 数据集中的每个句子生成对应的 utterance embedding. 以和音频文件夹同构的方式存储。存储格式是 `.npy` 文件。
首先 cd 到 [ge2e](../ge2e) 文件夹。下载训练好的 [模型](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip),然后运行脚本生成每个句子的 utterance embedding.
```bash
python inference.py --input=<intput> --output=<output> --device="gpu" --checkpoint_path=<pretrained checkpoint>
```
其中 input 是只包含音频文件夹的文件。这里可以用 `~/datasets/aishell3/train/wav`,然后 output 是用于存储 utterance embed 的文件夹,这里可以用 `~/datasets/aishell3/train/embed`。Utterance embedding 会以和音频文件夹相同的文件结构存储,格式为 `.npy`.
utterance embedding 的计算可能会用几个小时的时间,请耐心等待。
### 音频处理
因为 AISHELL-3 数据集前后有一些空白,静音片段,而且语音幅值很小,所以我们需要进行空白移除和音量规范化。空白移除可以简单的使用基于音量或者能量的方法,但是效果不是很好,对于不同的句子很难取到一个一致的阈值。我们使用的是先利用 Force Aligner 进行文本和语音的对齐。然后根据对齐结果截除空白。
我们使用的工具是 Montreal Force Aligner 1.0. 因为 aishell 的标注包含拼音标注,所以我们提供给 Montreal Force Aligner 的是拼音 transcription 而不是汉字 transcription. 而且需要把其中的韵律标记(`$``%`)去除,并且处理成 Montreal Force Alinger 所需要的文件形式。和音频同名的文本文件,扩展名为 `.lab`.
此外还需要准备词典文件。其中包含把拼音序列转换为 phone 序列的映射关系。在这里我们只做声母和韵母的切分,而声调则归为韵母的一部分。我们使用的[词典文件](./lexicon.txt)可以下载。
准备好之后运行训练和对齐。首先下载 [Montreal Force Aligner 1.0](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/tag/v1.0.1).下载之后解压即可运行。cd 到其中的 bin 文件夹运行命令,即可进行训练和对齐。前三个命令行参数分别是音频文件夹的路径,词典路径和对齐文件输出路径。可以通过`-o` 传入训练得到的模型保存路径。
```bash
./mfa_train_and_align \
~/datasets/aishell3/train/wav \
lexicon.txt \
~/datasets/aishell3/train/alignment \
-o aishell3_model \
-v
```
因为训练和对齐的时间比较长。我们提供了对齐后的 [alignment 文件](https://paddlespeech.bj.bcebos.com/Parakeet/alignment_aishell3.tar.gz),其中每个句子对应的文件为 `.TextGrid` 格式的文本。
得到了对齐文件之后,可以运行 `process_wav.py` 脚本来处理音频。
```bash
python process_wav.py --input=<input> --output=<output> --alignment=<alignment>
```
默认 input, output, alignment 分别是 `~/datasets/aishell3/train/wav`, `~/datasets/aishell3/train/normalized_wav`, `~/datasets/aishell3/train/alignment`.
处理结束后,会将处理好的音频保存在 `<output>` 文件夹中。
### 转录文本处理
把文本转换成为 phone 和 tone 的形式,并存储起来。值得注意的是,这里我们的处理和用于 montreal force aligner 的不一样。我们把声调分了出来。这是一个处理方式,当然也可以只做声母和韵母的切分。
运行脚本处理转录文本。
```bash
python preprocess_transcription.py --input=<input> --output=<output>
```
默认的 input 是 `~/datasets/aishell3/train`,其中会包含 `label_train-set.txt` 文件,处理后的结果会 `metadata.yaml``metadata.pickle`. 前者是文本格式,方便查看,后者是二进制格式,方便直接读取。
### mel 频谱提取
对处理后的音频进行 mel 频谱的提取,并且以和音频文件夹同构的方式存储,存储格式是 `.npy` 文件。
```python
python extract_mel.py --input=<intput> --output=<output>
```
input 是处理后的音频所在的文件夹,output 是输出频谱的文件夹。
## 训练
运行脚本训练。
```python
python train.py --data=<data> --output=<output> --device="gpu"
```
我们的模型去掉了 tacotron2 模型中的 stop token prediction。因为实践中由于 stop token prediction 是一个正负样例比例极不平衡的问题,每个句子可能有几百帧对应负样例,只有一帧正样例,而且这个 stop token prediction 对音频静音的裁切十分敏感。我们转用 attention 的最高点到达 encoder 侧的最后一个符号为终止条件。
另外,为了加速模型的收敛,我们加上了 guided attention loss, 诱导 encoder-decoder 之间的 alignment 更快地呈现对角线。
可以使用 visualdl 查看训练过程的 log。
```bash
visualdl --logdir=<output> --host=$HOSTNAME
```
示例 training loss / validation loss 曲线如下。
![train](./images/train.png)
![valid](./images/valid.png)
<img src="images/alignment-step2000.png" alt="alignment-step2000" style="zoom:50%;" />
大约从训练 2000 步左右就从 validation 过程中产出的 alignement 中可以观察到模糊的对角线。随着训练步数增加,对角线会更加清晰。但因为 validation 也是以 teacher forcing 的方式进行的,所以要在真正的 auto regressive 合成中产出的 alignment 中观察到对角线,需要更长的时间。
## 预训练模型
预训练模型下载链接。[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip).
## 使用
本实验包含了一个简单的使用示例,用户可以替换作为参考的声音以及文本,用训练好的模型来合成语音。使用方式参考 [notebook](./voice_cloning.ipynb) 上的使用说明。
因为 它太大了无法显示 source diff 。你可以改为 查看blob
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu"
\ No newline at end of file
#!/bin/bash
ge2e_params_path=$1
tacotron2_params_path=$2
waveflow_params_path=$3
vc_input=$4
vc_output=$5
python3 ${BIN_DIR}/voice_cloning.py \
--ge2e_params_path=${ge2e_params_path} \
--tacotron2_params_path=${tacotron2_params_path} \
--waveflow_params_path=${waveflow_params_path} \
--input-dir=${vc_input} \
--output-dir=${vc_output}
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=voice_cloning/tacotron2_ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
input=~/datasets/data_aishell3/train
preprocess_path=dump
alignment=./alignment
# not include ".pdparams" here
ge2e_ckpt_path=./ge2e_ckpt_0.3/step-3000000
train_output_path=output
# include ".pdparams" here
ge2e_params_path=${ge2e_ckpt_path}.pdparams
tacotron2_params_path=${train_output_path}/checkpoints/step-1000.pdparams
# pretrained model
# tacotron2_params_path=./tacotron2_aishell3_ckpt_0.3/step-450000.pdparams
waveflow_params_path=./waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams
vc_input=ref_audio
vc_output=syn_audio
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
CUDA_VISIBLE_DEVICES=${gpus} ./local/preprocess.sh ${input} ${preprocess_path} ${alignment} ${ge2e_ckpt_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/voice_cloning.sh ${ge2e_params_path} ${tacotron2_params_path} ${waveflow_params_path} ${vc_input} ${vc_output} || exit -1
fi
# CSMSC
* tts0 - Tactron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
# Speedyspeech with CSMSC
This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
## Dataset
......@@ -10,12 +9,23 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
## Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
6. inference using static model.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -37,13 +47,12 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, tones, durations, path of spectrogram, and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
```
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
......@@ -81,20 +90,7 @@ optional arguments:
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--tones-dict` is the path of the tone vocabulary file.
## Pretrained Model
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
SpeedySpeech checkpoint contains files listed below.
```text
speedyspeech_nosil_baker_ckpt_0.5
├── default.yaml # default config used to train speedyspeech
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
├── snapshot_iter_11400.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -107,9 +103,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
......@@ -152,9 +148,9 @@ optional arguments:
--device DEVICE device type to use
--verbose VERBOSE verbose
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
......@@ -203,21 +199,42 @@ optional arguments:
4. `--output-dir` is the directory to save synthesized audio files.
5. `--inference-dir` is the directory to save exported model, which can be used with paddle infernece.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--tones-dict` is the path of the tone vocabulary file.
7. `--phones-dict` is the path of the phone vocabulary file.
8. `--tones-dict` is the path of the tone vocabulary file.
### Inference
After Synthesize, we will get static models of speedyspeech and pwgan in `${train_output_path}/inference`.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`, which provides a paddle static model inference example for speedyspeech + pwgan synthesize.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
```
You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
## Pretrained Model
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
SpeedySpeech checkpoint contains files listed below.
```text
speedyspeech_nosil_baker_ckpt_0.5
├── default.yaml # default config used to train speedyspeech
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
├── snapshot_iter_11400.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
--speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
--speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--device="gpu" \
......
#!/bin/bash
python3 inference.py \
--inference-dir=exp/default/inference \
--text=../sentences.txt \
--output-dir=exp/default/pd_infer_out \
train_output_path=$1
python3 ${BIN_DIR}/inference.py \
--inference-dir=${train_output_path}/inference \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/pd_infer_out \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt
#!/bin/bash
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +12,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True \
--use-relative-path=True
......@@ -38,7 +39,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/tone to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy \
......@@ -46,7 +47,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy \
......@@ -54,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--tones-dict=dump/tone_id_map.txt \
--use-relative-path=True
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
python3 ${BIN_DIR}/synthesize.py \
--speedyspeech-config=${config_path} \
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--speedyspeech-stat=dump/train/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--inference-dir=exp/default/inference \
--output-dir=${train_output_path}/test \
--inference-dir=${train_output_path}/inference \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
--device="gpu"
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python synthesize_e2e.py \
--speedyspeech-config=conf/default.yaml \
--speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--speedyspeech-config=${config_path} \
--speedyspeech-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--speedyspeech-stat=dump/train/feats_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--inference-dir=exp/default/inference \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--inference-dir=${train_output_path}/inference \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt
#!/bin/bash
python ../train.py \
config_path=$1
train_output_path=$2
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \
......
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=speedyspeech
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_76.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# inference with static model
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
......@@ -9,13 +9,22 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -40,11 +49,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
......@@ -78,18 +87,7 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
......@@ -102,9 +100,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -144,9 +142,9 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -191,18 +189,31 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_baker_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
......
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_76000.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_153.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1 \
--phones-dict=dump/phone_id_map.txt
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_153.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with CSMSC
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html).
## Preprocess the dataset
## Dataset
### Download and Extract the datasaet
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
......@@ -8,12 +8,21 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -30,17 +39,15 @@ dump
├── raw
└── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains log magnitude of mel spectrogram of each utterances, while the norm folder contains normalized spectrogram. The statistics used to normalize the spectrogram is computed from the training set, which is located in `dump/train/feats_stats.npy`.
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
......@@ -86,25 +93,10 @@ benchmark:
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
## Pretrained Models
Pretrained models can be downloaded here:
1. Parallel WaveGAN checkpoint. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip), which is used as a vocoder in the end-to-end inference script.
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Synthesize
`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -127,10 +119,21 @@ optional arguments:
```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_baker_ckpt_0.4
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/BZNSYP/ \
--dataset=baker \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -3,4 +3,9 @@
* tts0 - Tactron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
# Tacotron2
# Tacotron2 with LJSpeech
PaddlePaddle dynamic graph implementation of Tacotron2, a neural network architecture for speech synthesis directly from text. The implementation is based on [Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884).
## Project Structure
```text
├── config.py # default configuration file
├── ljspeech.py # dataset and dataloader settings for LJSpeech
├── preprocess.py # script to preprocess LJSpeech dataset
├── synthesize.py # script to synthesize spectrogram from text
├── train.py # script for tacotron2 model training
├── synthesize.ipynb # notebook example for end-to-end TTS
```
## Dataset
We experiment with the LJSpeech dataset. Download and unzip [LJSpeech](https://keithito.com/LJ-Speech-Dataset/).
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
tar xjvf LJSpeech-1.1.tar.bz2
```
Then you need to preprocess the data by running ``preprocess.py``, the preprocessed data will be placed in ``--output`` directory.
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize mels.
```bash
python preprocess.py \
--input=${DATAPATH} \
--output=${PREPROCESSEDDATAPATH} \
-v \
./run.sh
```
For more help on arguments
``python preprocess.py --help``.
## Train the model
Tacotron2 model can be trained by running ``train.py``.
### Preprocess the dataset
```bash
python train.py \
--data=${PREPROCESSEDDATAPATH} \
--output=${OUTPUTPATH} \
--device=gpu \
./local/preprocess.sh ${conf_path}
```
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
usage: train.py [-h] [--config FILE] [--data DATA_DIR] [--output OUTPUT_DIR]
[--checkpoint_path CHECKPOINT_PATH] [--device {cpu,gpu}]
[--nprocs NPROCS] [--opts ...]
optional arguments:
-h, --help show this help message and exit
--config FILE path of the config file to overwrite to default config
with.
--data DATA_DIR path to the datatset.
--output OUTPUT_DIR path to save checkpoint and logs.
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load
--device {cpu,gpu} device type to use, cpu and gpu are supported.
--nprocs NPROCS number of parallel processes to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
```
If you want to train on CPU, just set ``--device=cpu``.
If you want to train on multiple GPUs, just set ``--nprocs`` as num of GPU.
By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint. And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load.
By default, training will be resumed from the latest checkpoint in ``--output``, if you want to start a new training, please use a new ``${OUTPUTPATH}`` with no checkpoint.
And if you want to resume from an other existing model, you should set ``checkpoint_path`` to be the checkpoint path you want to load.
**Note: The checkpoint path cannot contain the file extension.**
For more help on arguments
``python train_transformer.py --help``.
## Synthesize
After training the Tacotron2, spectrogram can be synthesized by running ``synthesize.py``.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which synthesize **mels** from text_list here.
```bash
python synthesize.py \
--config=${CONFIGPATH} \
--checkpoint_path=${CHECKPOINTPATH} \
--input=${TEXTPATH} \
--output=${OUTPUTPATH}
--device=gpu
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name}
```
The ``${CONFIGPATH}`` needs to be matched with ``${CHECKPOINTPATH}``.
For more help on arguments
``python synthesize.py --help``.
Then you can find the spectrogram files in ``${OUTPUTPATH}``, and then they can be the input of vocoder like [waveflow](../waveflow/README.md#Synthesis) to get audio files.
```text
usage: synthesize.py [-h] [--config FILE] [--checkpoint_path CHECKPOINT_PATH]
[--input INPUT] [--output OUTPUT] [--device DEVICE]
[--opts ...] [-v]
generate mel spectrogram with TransformerTTS.
optional arguments:
-h, --help show this help message and exit
--config FILE extra config to overwrite the default config
--checkpoint_path CHECKPOINT_PATH
path of the checkpoint to load.
--input INPUT path of the text sentences
--output OUTPUT path to save outputs
--device DEVICE device type to use.
--opts ... options to overwrite --config file and the default
config, passing in KEY VALUE pairs
-v, --verbose print msg
```
**Ps.** You can use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder to synthesize mels to wavs. (Please refer to `synthesize.sh` in our LJSpeech waveflow example)
## Pretrained Models
Pretrained Models can be downloaded from links below. We provide 2 models with different configurations.
1. This model use a binary classifier to predict the stop token. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. This model does not have a stop token predictor. It uses the attention peak position to decided whether all the contents have been uttered. Also guided attention loss is used to speed up training. This model is trained with `configs/alternative.yaml`.[tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)
## Notebook: End-to-end TTS
See [synthesize.ipynb](./synthesize.ipynb) for details about end-to-end TTS with tacotron2 and waveflow.
#!/bin/bash
preprocess_path=$1
python3 ${BIN_DIR}/preprocess.py \
--input=~/datasets/LJSpeech-1.1 \
--output=${preprocess_path} \
-v \
\ No newline at end of file
#!/bin/bash
train_output_path=$1
ckpt_name=$2
python3 ${BIN_DIR}/synthesize.py \
--config=${train_output_path}/config.yaml \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--input=${BIN_DIR}/../sentences_en.txt \
--output=${train_output_path}/test
--device=gpu
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device=gpu \
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=tacotron2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
preprocess_path=preprocessed_ljspeech
train_output_path=output
ckpt_name=step-35000
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${preprocess_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -8,12 +8,21 @@ wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh.
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
......@@ -35,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, path of speech features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -71,17 +80,6 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
TransformerTTS checkpoint contains files listed below.
```text
transformer_tts_ljspeech_ckpt_0.4
├── default.yaml # default config used to train transformer_tts
├── phone_id_map.txt # phone vocabulary file when training transformer_tts
├── snapshot_iter_201500.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
```
## Synthesize
We use [waveflow](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/waveflow) as the neural vocoder.
Download Pretrained WaveFlow Model with residual channel equals 128 from [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip) and unzip it.
......@@ -94,9 +92,9 @@ waveflow_ljspeech_ckpt_0.3
├── config.yaml # default config used to train waveflow
└── step-2000000.pdparams # model parameters of waveflow
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG]
......@@ -132,9 +130,9 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h]
......@@ -177,17 +175,30 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained transformer_tts and waveflow models.
## Pretrained Model
Pretrained Model can be downloaded here. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
TransformerTTS checkpoint contains files listed below.
```text
transformer_tts_ljspeech_ckpt_0.4
├── default.yaml # default config used to train transformer_tts
├── phone_id_map.txt # phone vocabulary file when training transformer_tts
├── snapshot_iter_201500.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training transformer_tts
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained transformer_tts and waveflow models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-config=transformer_tts_ljspeech_ckpt_0.4/default.yaml \
--transformer-tts-checkpoint=transformer_tts_ljspeech_ckpt_0.4/snapshot_iter_201500.pdz \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
......
......@@ -3,12 +3,12 @@
stage=1
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=ljspeech \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dumpdir=dump \
......@@ -27,21 +27,21 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--transformer-tts-config=conf/default.yaml \
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
python3 ${BIN_DIR}/synthesize.py \
--transformer-tts-config=${config_path} \
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--transformer-tts-config=conf/default.yaml \
--transformer-tts-checkpoint=exp/default/checkpoints/snapshot_iter_201500.pdz \
python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-config=${config_path} \
--transformer-tts-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
--text=../sentences.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt
001 Life was like a box of chocolates, you never know what you're gonna get.
002 With great power there must come great responsibility.
003 To be or not to be, that’s a question.
004 A man can be destroyed but not defeated
005 Do not, for one repulse, give up the purpose that you resolved to effort.
006 Death is just a part of life, something we're all destined to do.
007 I think it's hard winning a war with words.
008 Don’t argue with the people of strong determination, because they may change the fact!
009 Love you three thousand times.
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=transformer_tts
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_403.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
......@@ -9,13 +9,22 @@ Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -40,10 +49,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
......@@ -78,18 +87,7 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_ljspeech_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_100000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/ljspeech/) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip) and unzip it.
```bash
......@@ -102,9 +100,9 @@ pwg_ljspeech_ckpt_0.5
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -144,12 +142,12 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e_en.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
usage: synthesize_e2e_en.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
......@@ -191,18 +189,31 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_ljspeech_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_100000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/synthesize_e2e_en.py \
--fastspeech2-config=fastspeech2_nosil_ljspeech_ckpt_0.5/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_ljspeech_ckpt_0.5/snapshot_iter_100000.pdz \
--fastspeech2-stat=fastspeech2_nosil_ljspeech_ckpt_0.5/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--text=../sentences_en.txt \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
......
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./ljspeech_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=ljspeech \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=8 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
python3 ${BIN_DIR}/synthesize_e2e_en.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--text=../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1 \
--phones-dict=dump/phone_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_201.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# WaveFlow with LJSpeech
## Dataset
### Download the datasaet.
```bash
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
```
### Extract the dataset.
```bash
tar xjvf LJSpeech-1.1.tar.bz2
```
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the Tacotron2 generated mels is `../tts0/output/test`.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs from mels.
```bash
./run.sh
```
### Preprocess the dataset.
Assume the path to save the preprocessed dataset is `ljspeech_waveflow`. Run the command below to preprocess the dataset.
```bash
python preprocess.py --input=LJSpeech-1.1/ --output=ljspeech_waveflow
./local/preprocess.sh ${preprocess_path}
```
## Train the model
The training script requires 4 command line arguments.
`--data` is the path of the training dataset, `--output` is the path of the output directory (we recommend to use a subdirectory in `runs` to manage different experiments.)
`--device` should be "cpu" or "gpu", `--nprocs` is the number of processes to train the model in parallel.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
python train.py --data=ljspeech_waveflow/ --output=runs/test --device="gpu" --nprocs=1
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
The training script requires 4 command line arguments.
1. `--data` is the path of the training dataset.
2. `--output` is the path of the output directory.
3. `--device` should be "cpu" or "gpu"
4. `--nprocs` is the number of processes to train the model in parallel.
If you want distributed training, set a larger `--nprocs` (e.g. 4). Note that distributed training with cpu is not supported yet.
## Synthesize
Synthesize waveform. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
`--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here.
`--device` specifies to device to run synthesis on.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from mels.
```bash
python synthesize.py --input=mels/ --output=wavs/ --checkpoint_path='step-2000000' --device="gpu" --verbose
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name}
```
## Pretrained Model
Synthesize waveform.
1. We assume the `--input` is a directory containing several mel spectrograms(log magnitude) in `.npy` format.
2. The output would be saved in `--output` directory, containing several `.wav` files, each with the same name as the mel spectrogram does.
3. `--checkpoint_path` should be the path of the parameter file (`.pdparams`) to load. Note that the extention name `.pdparmas` is not included here.
4. `--device` specifies to device to run synthesis on.
## Pretrained Model
Pretrained Model with residual channel equals 128 can be downloaded here. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip).
#!/bin/bash
preprocess_path=$1
python3 ${BIN_DIR}/preprocess.py \
--input=~/datasets/LJSpeech-1.1 \
--output=${preprocess_path}
\ No newline at end of file
#!/bin/bash
input_mel_path=$1
train_output_path=$2
ckpt_name=$3
python ${BIN_DIR}/synthesize.py \
--input=${input_mel_path} \
--output=${train_output_path}/wavs/ \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--device="gpu" \
--verbose
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu" \
--nprocs=1
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=waveflow
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
preprocess_path=preprocessed_ljspeech
train_output_path=output
# mel generated by Tacotron2
input_mel_path=../tts0/output/test
ckpt_name=step-10000
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${preprocess_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${input_mel_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with the LJSpeech-1.1 dataset
# Parallel WaveGAN with the LJSpeech-1.1
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/).
## Preprocess the dataset
## Dataset
### Download and Extract the datasaet
Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/).
### Get MFA results for silence trim
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence in the edge of audio.
You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/LJSpeech-1.1`.
Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -38,10 +44,10 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
......@@ -88,23 +94,10 @@ benchmark:
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
## Pretrained Models
Pretrained models can be downloaded here:
1. Parallel WaveGAN checkpoint. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip), which is used as a vocoder in the end-to-end inference script.
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_ljspeech_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Synthesize
`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -127,10 +120,21 @@ optional arguments:
```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. If you use the pretrained model, use the `pwg_snapshot_iter_400000.pdz`.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_ljspeech_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_400000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./ljspeech_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/LJSpeech-1.1/ \
--dataset=ljspeech \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -39,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_400000.pdz\
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Speaker Encoder
This experiment trains a speaker encoder with speaker verification as its task. It is done as a part of the experiment of transfer learning from speaker verification to multispeaker text-to-speech synthesis, which can be found at [tacotron2_aishell3](../tacotron2_shell3). The trained speaker encoder is used to extract utterance embeddings from utterances.
## Model
The model used in this experiment is the speaker encoder with text independent speaker verification task in [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf). GE2E-softmax loss is used.
## File Structure
```text
ge2e
├── README.md
├── README_cn.md
├── audio_processor.py
├── config.py
├── dataset_processors.py
├── inference.py
├── preprocess.py
├── random_cycle.py
├── speaker_verification_dataset.py
└── train.py
```
## Download Datasets
Currently supported datasets are Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata, which can be downloaded from corresponding webpage.
1. Librispeech/train-other-500
An English multispeaker dataset,[URL](https://www.openslr.org/resources/12/train-other-500.tar.gz),only the `train-other-500` subset is used.
2. VoxCeleb1
An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev D should be downloaded, combined and extracted.
3. VoxCeleb2
An English multispeaker dataset,[URL](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html) , Audio Files from Dev A to Dev H should be downloaded, combined and extracted.
4. Aidatatang-200zh
A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/62/) .
5. magicdata
A Mandarin Chinese multispeaker dataset ,[URL](https://www.openslr.org/68/) .
If you want to use other datasets, you can also download and preprocess it as long as it meets the requirements described below.
## Preprocess Datasets
## Get Started
```bash
./run.sh
```
### Preprocess Datasets
`./local/preprocess.sh` calls `${BIN_DIR}/preprocess.py`.
```bash
./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names}
```
Assume datasets_root is `~/datasets/GE2E`, and it has the follow structure(We only use `train-other-500` for simplicity):
```Text
GE2E
├── LibriSpeech
└── (other datasets)
```
Multispeaker datasets are used as training data, though the transcriptions are not used. To enlarge the amount of data used for training, several multispeaker datasets are combined. The preporcessed datasets are organized in a file structure described below. The mel spectrogram of each utterance is save in `.npy` format. The dataset is 2-stratified (speaker-utterance). Since multiple datasets are combined, to avoid conflict in speaker id, dataset name is prepended to the speake ids.
```text
dataset_root
├── dataset01_speaker01/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
├── dataset01_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
├── dataset02_speaker01/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
└── dataset02_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
├── utterance01.npy
├── utterance02.npy
└── utterance03.npy
```
In `${BIN_DIR}/preprocess.py`:
1. `--datasets_root` is the directory that contains several extracted dataset
2. `--output_dir` is the directory to save the preprocessed dataset
3. `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
Run the command to preprocess datasets.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path}
```
Here `--datasets_root` is the directory that contains several extracted dataset; `--output_dir` is the directory to save the preprocessed dataset; `--dataset_names` is the dataset to preprocess. If there are multiple datasets in `--datasets_root` to preprocess, the names can be joined with comma. Currently supported dataset names are librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh and magicdata.
## Training
When preprocessing is done, run the command below to train the mdoel.
```bash
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
```
- `--data` is the path to the preprocessed dataset.
- `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training.
- `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
- `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
In `${BIN_DIR}/train.py`:
1. `--data` is the path to the preprocessed dataset.
2. `--output` is the directory to save results,usually a subdirectory of `runs`.It contains visualdl log files, text log files, config file and a `checkpoints` directory, which contains parameter file and optimizer state file. If `--output` already has some training results in it, the most recent parameter file and optimizer state file is loaded before training.
3. `--device` is the device type to run the training, 'cpu' and 'gpu' are supported.
4. `--nprocs` is the number of replicas to run in multiprocessing based parallel training。Currently multiprocessing based parallel training is only enabled when using 'gpu' as the devicde.
5. `CUDA_VISIBLE_DEVICES` can be used to specify visible devices with cuda.
Other options are described below.
......@@ -99,29 +80,23 @@ Other options are described below.
- `--opts` is command line options to further override config files. It should be the last comman line options passed with multiple key-value pairs separated by spaces.
- `--checkpoint_path` specifies the checkpoiont to load before training, extension is not included. A parameter file ( `.pdparams`) and an optimizer state file ( `.pdopt`) with the same name is used. This option has a higher priority than auto-resuming from the `--output` directory.
## Pretrained Model
The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps.
Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
## Inference
### Inference
When training is done, run the command below to generate utterance embedding for each utterance in a dataset.
`./local/inference.sh` calls `${BIN_DIR}/inference.py`.
```bash
python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name}
```
In `${BIN_DIR}/inference.py`:
1. `--input` is the path of the dataset used for inference.
2. `--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format.
3. `--checkpoint_path` is the path of the checkpoint to use, extension not included.
4. `--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`.
5. `--device` and `--opts` have the same meaning as in the training script.
`--input` is the path of the dataset used for inference.
`--output` is the directory to save the processed results. It has the same file structure as the input dataset. Each utterance in the dataset has a corrsponding utterance embedding file in `*.npy` format.
`--checkpoint_path` is the path of the checkpoint to use, extension not included.
`--pattern` is the wildcard pattern to filter audio files for inference, defaults to `*.wav`.
## Pretrained Model
The pretrained model is first trained to 1560k steps at Librispeech-other-500 and voxceleb1. Then trained at aidatatang_200h and magic_data to 3000k steps.
`--device` and `--opts` have the same meaning as in the training script.
Download URL [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip).
## References
......
#!/bin/bash
#generate utterance embedding for each utterance in a dataset.
infer_input=$1
infer_output=$2
train_output_path=$3
ckpt_name=$4
python3 ${BIN_DIR}/inference.py \
--input=${infer_input} \
--output=${infer_output} \
--checkpoint_path=${train_output_path}/checkpoints/${ckpt_name} \
--device="gpu"
#!/bin/bash
datasets_root=$1
preprocess_path=$2
dataset_names=$3
python3 ${BIN_DIR}/preprocess.py \
--datasets_root=${datasets_root} \
--output_dir=${preprocess_path} \
--dataset_names=${dataset_names}
\ No newline at end of file
#!/bin/bash
preprocess_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--data=${preprocess_path} \
--output=${train_output_path} \
--device="gpu" \
--nprocs=1
\ No newline at end of file
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=ge2e
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
datasets_root=~/datasets/GE2E
preprocess_path=dump
dataset_names=librispeech_other
train_output_path=output
infer_input=infer_input
infer_output=infer_output
ckpt_name=step-10000
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${datasets_root} ${preprocess_path} ${dataset_names} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${preprocess_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${infer_input} ${infer_output} ${train_output_path} ${ckpt_name} || exit -1
fi
# Punctation Restoration
Please using [PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) to do this task.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from collections import defaultdict
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from pathlib import Path
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from pathlib import Path
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from collections import OrderedDict
from pathlib import Path
import logging
def detect_oov(corpus_dir, lexicon_path, transcription_pattern="*.lab"):
......
......@@ -20,9 +20,8 @@ than words are used in transcriptions produced by `reorganize_baker.py`.
We make this choice to better leverage other software for chinese text to
pinyin tools like pypinyin. This is the convention for G2P in Chinese.
"""
import re
import argparse
import re
from collections import OrderedDict
INITIALS = [
......
#!/bin/bash
python3 ../synthesize.py \
--config=conf/default.yaml \
--checkpoint=exp/default/checkpoints/snapshot_iter_35000.pdz_bak\
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test
# VCTK
* tts0 - Tactron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
\ No newline at end of file
......@@ -12,13 +12,22 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith
1. `p315`, because no txt for it.
2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from text file.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -43,11 +52,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains phones, text_lengths, speech_lengths, durations, path of speech features, path of pitch features, path of energy features, speaker and id of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
......@@ -81,14 +90,23 @@ optional arguments:
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
## Pretrained Model
## Synthesize
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)and unzip it.
```bash
unzip pwg_vctk_ckpt_0.5.zip
```
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
......@@ -128,17 +146,20 @@ optional arguments:
--device DEVICE device type to use.
--verbose VERBOSE verbose.
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e_en.py`, which can synthesize waveform from text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
usage: multi_spk_synthesize_e2e_en.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT]
[--text TEXT] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
......@@ -161,6 +182,8 @@ optional arguments:
spectrogram when training parallel wavegan.
--phones-dict PHONES_DICT
phone vocabulary file.
--speaker-dict SPEAKER_DICT
speaker id map file.
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output-dir OUTPUT_DIR
output dir.
......@@ -175,7 +198,34 @@ optional arguments:
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device is` the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
You can use the following scripts to synthesize for `../sentences_en.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_vctk_ckpt_0.5
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_66200.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \
--fastspeech2-config=fastspeech2_nosil_vctk_ckpt_0.5/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt
```
#!/bin/bash
stage=1
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,18 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./vctk_alignment \
--output durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../preprocess.py \
python3 ${BIN_DIR}/preprocess.py \
--dataset=vctk \
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi
......@@ -46,7 +46,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and covert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
......@@ -64,7 +64,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ../normalize.py \
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak\
python3 ${BIN_DIR}/synthesize.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=exp/default/test \
--output-dir=${train_output_path}/test \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
--fastspeech2-config=conf/default.yaml \
--fastspeech2-checkpoint=exp/default/checkpoints/snapshot_iter_32769.pdz_bak \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e_en.py \
--fastspeech2-config=${config_path} \
--fastspeech2-checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--fastspeech2-stat=dump/train/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences_en.txt \
--output-dir=exp/default/test_e2e \
--pwg-config=pwg_vctk_ckpt_0.5/pwg_default.yaml \
--pwg-checkpoint=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \
--pwg-stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \
--text=${BIN_DIR}/../sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--device="gpu" \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
python3 ../train.py \
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=2 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=fastspeech2
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/${MODEL}
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_331.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e, vocoder is pwgan
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Parallel WaveGAN with VCTK
This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443).
## Preprocess the dataset
## Dataset
### Download and Extract the datasaet
Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`.
......@@ -11,12 +12,21 @@ ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://gith
1. `p315`, because no txt for it.
2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them.
### Preprocess the dataset
## Get Started
Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`.
Assume the path to the MFA result of VCTK is `./vctk_alignment`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset,
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./preprocess.sh
./run.sh
```
### Preprocess the dataset
```bash
./local/preprocess.sh ${conf_path}
```
When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below.
......@@ -38,12 +48,11 @@ The dataset is split into 3 parts, namely `train`, `dev` and `test`, each of whi
Also there is a `metadata.jsonl` in each subfolder. It is a table-like file which contains id and paths to spectrogam of each utterance.
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
......@@ -88,15 +97,10 @@ benchmark:
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are save in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
## Pretrained Models
## Synthesize
`synthesize.sh` calls `../synthesize.py `, which can synthesize waveform from `metadata.jsonl`.
### Synthesize
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--config CONFIG] [--checkpoint CHECKPOINT]
......@@ -124,5 +128,16 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported.
## Pretrained Models
Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip).
Parallel WaveGAN checkpoint contains files listed below.
```text
pwg_vctk_ckpt_0.5
├── pwg_default.yaml # default config used to train parallel wavegan
├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
......@@ -3,7 +3,7 @@
stage=0
stop_stage=100
export MAIN_ROOT=`realpath ${PWD}/../../../../`
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
......@@ -11,17 +11,18 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./vctk_alignment \
--output=durations.txt \
--config=conf/default.yaml
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ../../preprocess.py \
python3 ${BIN_DIR}/../preprocess.py \
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
--dataset=vctk \
--dumpdir=dump \
--dur-file=durations.txt \
--config=conf/default.yaml \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
......@@ -38,16 +39,16 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ../../normalize.py \
python3 ${BIN_DIR}/../normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
......
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ../train.py \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=conf/default.yaml \
--output-dir=exp/default \
--config=${config_path} \
--output-dir=${train_output_path} \
--nprocs=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=parallelwave_gan
export BIN_DIR=${MAIN_ROOT}/parakeet/exps/gan_vocoder/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_5000.pdz
# with the following command, you can choice the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
# Speaker Encoder
本实验是的在多说话人数据集上以 Speaker Verification 为任务训练一个 speaker encoder, 这是作为 transfer learning from speaker verification to multispeaker text-to-speech synthesis 实验的一部分, 可以在 [tacotron2_aishell3](../tacotron2_aishell3) 中找到。用训练好的模型来提取音频的 utterance embedding.
## 模型
本实验使用的模型是 [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf) 中的 speaker encoder text independent 模型。使用的是 GE2E softmax 损失函数。
## 目录结构
```text
ge2e
├── README_cn.md
├── audio_processor.py
├── config.py
├── dataset_processors.py
├── inference.py
├── preprocess.py
├── random_cycle.py
├── speaker_verification_dataset.py
└── train.py
```
## 数据集下载
本实验支持了 Librispeech-other-500, VoxCeleb, VoxCeleb2,ai-datatang-200zh, magicdata 数据集。可以在对应的页面下载。
1. Librispeech/train-other-500
英文多说话人数据集,[下载链接](https://www.openslr.org/resources/12/train-other-500.tar.gz),我们的实验中仅用到了 train-other-500 这个子集。
2. VoxCeleb1
英文多说话人数据集,[下载链接](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox1.html),需要下载其中的 Audio Files 中的 Dev A 到 Dev D 四个压缩文件并合并解压。
3. VoxCeleb2
英文多说话人数据集,[下载链接](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/vox2.html),需要下载其中的 Audio Files 中的 Dev A 到 Dev H 八个压缩文件并合并解压。
4. Aidatatang-200zh
中文多说话人数据集,[下载链接](https://www.openslr.org/62/)
5. magicdata
中文多说话人数据集,[下载链接](https://www.openslr.org/68/)
如果用户需要使用其他的数据集,也可以自行下载并进行数据处理,只要符合如下的要求。
## 数据集预处理
训练中使用的数据集是多说话人数据集,transcription 并不会被使用。为了扩大数据的量,训练过程可以将多个数据集合并为一个。处理后的文件结果组织方式如下,每个句子的频谱存储为 `.npy` 格式。以 speaker-utterance 的两层目录结构存储。因为合并数据集的原因,为了避免 speaker id 冲突,dataset 名会被添加到 speaker id 前面。
```text
dataset_root
├── dataset01_speaker01/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
├── dataset01_speaker02/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
├── dataset02_speaker01/
│   ├── utterance01.npy
│   ├── utterance02.npy
│   └── utterance03.npy
└── dataset02_speaker02/
   ├── utterance01.npy
   ├── utterance02.npy
   └── utterance03.npy
```
运行数据处理脚本
```bash
python preprocess.py --datasets_root=<datasets_root> --output_dir=<output_dir> --dataset_names=<dataset_names>
```
其中 datasets_root 是包含多个原始数据集的路径,--output_dir 是多个数据集合并后输出的路径,dataset_names 是数据集的名称,多个数据集可以用逗号分割,比如 'librispeech_other, voxceleb1'. 目前支持的数据集有 librispeech_other, voxceleb1, voxceleb2, aidatatang_200zh, magicdata.
## 训练
数据处理完成后,使用如下的脚本训练。
```bash
python train.py --data=<data_path> --output=<output> --device="gpu" --nprocs=1
```
- `--data` 是处理后的数据集路径。
- `--output` 是训练结果的保存路径,一般使用 runs 下的一个子目录。保存结果包含 visualdl 的 log 文件,文本 log 记录,运行 config 备份,以及 checkpoints 目录,里面包含参数文件和优化器状态文件。如果指定的 output 路径包含此前的训练结果,训练前会自动加载最近的参数文件和优化器状态文件。
- `--device` 是运行设备,目前支持 'cpu' 和 'gpu'.
- `--nprocs` 是指定运行进程数。目前仅在使用 'gpu' 是支持多进程训练。可以配合 `CUDA_VISIBLE_DEVICES` 环境变量指定可见卡号。
另外还有几个选项。
- `--config` 是用于覆盖默认配置(默认配置可以查看 `config.py`) 的配置文件,为 `.yaml` 文件。
- `--opts` 是用命令行参数进一步覆盖配置。这是最后一个传入的命令行选项,用多组空格分隔的 KEY VALUE 对的方式传入。
- `--checkpoint_path` 指定从中恢复的 checkpoint, 不需要包含扩展名。同名的参数文件( `.pdparams`) 和优化器文件( `.pdopt`)会被加载以恢复训练。这个参数指定的恢复训练优先级高于自动从 `output` 文件夹中恢复训练。
## 预训练模型
预训练模型是在 Librispeech-other-500 和 voxceleb1 上训练到 1560k steps 后用 aidatatang_200h 和 magic_data 训练到 3000k 的结果。
下载链接 [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
## 预测
使用训练好的模型进行预测,对一个数据集中的所有 utterance 生成一个 embedding.
```bash
python inference.py --input=<input> --output=<output> --checkpoint_path=<checkpoint_path> --device="gpu"
```
- `--input` 是需要处理的数据集的路径。
- `--output` 是处理的结果,它会保持和 `--input` 相同的文件夹结构,对应 input 中的每一个音频文件会有一个同名的 `*.npy` 文件,是从这个音频文件中提取到的 utterance embedding.
- `--checkpoint_path` 为用于预测的参数文件路径,不包含扩展名。
- `--pattern` 是用于筛选数据集中需要处理的音频文件的通配符模式,默认为 `*.wav`.
- `--device``--opts` 的语义和训练脚本一致。
## 参考文献
1. [GENERALIZED END-TO-END LOSS FOR SPEAKER VERIFICATION](https://arxiv.org/pdf/1710.10467.pdf)
2. [Transfer Learning from Speaker Verification toMultispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf)
......@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from . import data
from . import datasets
from . import exps
from . import frontend
from . import models
from . import modules
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .spec_normalizer import LogMagnitude
from .spec_normalizer import NormalizerBase
......@@ -13,6 +13,5 @@
# limitations under the License.
"""Parakeet's infrastructure for data processing.
"""
from .dataset import *
from .batch import *
from .dataset import *
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .common import *
from .ljspeech import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
......@@ -26,6 +25,8 @@ import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode
from parakeet.data.get_feats import Energy
from parakeet.data.get_feats import LogMelFBank
from parakeet.data.get_feats import Pitch
......@@ -34,7 +35,6 @@ from parakeet.datasets.preprocess_utils import get_input_token
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import get_spk_id_map
from parakeet.datasets.preprocess_utils import merge_silence
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -22,6 +21,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -20,13 +19,14 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, fastspeech2_config, pwg_config):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -21,6 +20,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
......
......@@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
......@@ -25,9 +25,12 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
from parakeet.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Evaluator
from parakeet.models.fastspeech2 import FastSpeech2Updater
......@@ -36,9 +39,6 @@ from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,11 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
from timer import timer
import jsonlines
import numpy as np
......@@ -23,9 +21,11 @@ import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from timer import timer
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.parallel_wavegan import PWGGenerator
from yacs.config import CfgNode
def main():
......
......@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import logging
import os
from pathlib import Path
import librosa
......@@ -22,20 +21,12 @@ import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode as Configuration
def get_cfg_default():
config_path = (Path(__file__).parent / "conf" / "default.yaml").resolve()
with open(config_path, 'rt') as f:
_C = yaml.safe_load(f)
_C = Configuration(_C)
config = _C.clone()
return config
def evaluate(args, config):
......@@ -91,7 +82,7 @@ def main():
description="Synthesize with parallel wavegan.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config.")
"--config", type=str, help="parallel wavegan config file.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument(
"--stat",
......@@ -108,9 +99,8 @@ def main():
paddle.set_device(args.device)
config = get_cfg_default()
if args.config:
config.merge_from_file(args.config)
with open(args.config) as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
......
......@@ -11,11 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
......@@ -28,20 +28,20 @@ from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam # No RAdaom
from paddle.optimizer.lr import StepDecay
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.datasets.vocoder_batch_fn import Clip
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGDiscriminator
from parakeet.models.parallel_wavegan import PWGUpdater
from parakeet.models.parallel_wavegan import PWGEvaluator
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGUpdater
from parakeet.modules.stft_loss import MultiResolutionSTFTLoss
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
......@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
......@@ -24,12 +25,11 @@ import librosa
import numpy as np
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import struct
from pathlib import Path
from warnings import warn
import struct
from scipy.ndimage.morphology import binary_dilation
import numpy as np
import librosa
import numpy as np
from scipy.ndimage.morphology import binary_dilation
try:
import webrtcvad
......@@ -97,7 +96,7 @@ def trim_long_silences(wav,
return ret[width - 1:] / width
audio_mask = moving_average(voice_flags, vad_moving_average_width)
audio_mask = np.round(audio_mask).astype(np.bool)
audio_mask = np.round(audio_mask).astype(bool)
# Dilate the voiced regions
audio_mask = binary_dilation(audio_mask,
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
_C = CfgNode()
......
......@@ -11,16 +11,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing as mp
from functools import partial
from typing import List
from pathlib import Path
import multiprocessing as mp
from typing import List
import numpy as np
from tqdm import tqdm
from audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
def _process_utterance(path_pair, processor: SpeakerVerificationPreprocessor):
......
......@@ -11,19 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import tqdm
import paddle
import numpy as np
import paddle
import tqdm
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from audio_processor import SpeakerVerificationPreprocessor
from config import get_cfg_defaults
def embed_utterance(processor, model, fpath_or_wav):
# audio processor
......
......@@ -11,14 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from config import get_cfg_defaults
from audio_processor import SpeakerVerificationPreprocessor
from dataset_processors import (process_librispeech, process_voxceleb1,
process_voxceleb2, process_aidatatang_200zh,
process_magicdata)
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.exps.ge2e.dataset_processors import process_aidatatang_200zh
from parakeet.exps.ge2e.dataset_processors import process_librispeech
from parakeet.exps.ge2e.dataset_processors import process_magicdata
from parakeet.exps.ge2e.dataset_processors import process_voxceleb1
from parakeet.exps.ge2e.dataset_processors import process_voxceleb2
if __name__ == "__main__":
parser = argparse.ArgumentParser(
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from pathlib import Path
import numpy as np
from paddle.io import Dataset, BatchSampler
from paddle.io import BatchSampler
from paddle.io import Dataset
from random_cycle import random_cycle
from parakeet.exps.ge2e.random_cycle import random_cycle
class MultiSpeakerMelDataset(Dataset):
......
......@@ -11,23 +11,21 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from paddle import distributed as dist
from paddle.optimizer import Adam
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn.clip import ClipGradByGlobalNorm
from paddle.optimizer import Adam
from parakeet.exps.ge2e.config import get_cfg_defaults
from parakeet.exps.ge2e.speaker_verification_dataset import Collate
from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerMelDataset
from parakeet.exps.ge2e.speaker_verification_dataset import MultiSpeakerSampler
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from parakeet.training import ExperimentBase
from parakeet.training import default_argument_parser
from speaker_verification_dataset import MultiSpeakerMelDataset
from speaker_verification_dataset import MultiSpeakerSampler
from speaker_verification_dataset import Collate
from config import get_cfg_defaults
from parakeet.training import ExperimentBase
class Ge2eExperiment(ExperimentBase):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,13 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import soundfile as sf
from paddle import inference
from parakeet.frontend.zh_frontend import Frontend
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,27 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import re
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
from pathlib import Path
from typing import Any
from typing import Dict
from typing import List
import argparse
import jsonlines
import librosa
import numpy as np
import re
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from yacs.config import CfgNode
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode
def process_sentence(config: Dict[str, Any],
......
......@@ -11,25 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import argparse
import logging
import os
from pathlib import Path
import jsonlines
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.modules.normalizer import ZScore
......
......@@ -11,25 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from pathlib import Path
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
import yaml
from paddle import jit
from paddle.static import InputSpec
from yacs.config import CfgNode
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode
def evaluate(args, speedyspeech_config, pwg_config):
......
......@@ -11,22 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import yaml
from paddle import distributed as dist
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import speedyspeech_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechEvaluator
from parakeet.models.speedyspeech import SpeedySpeechUpdater
......@@ -35,9 +38,6 @@ from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from pathlib import Path
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pickle
from pathlib import Path
import numpy as np
from paddle.io import Dataset
from parakeet.data.batch import batch_spec, batch_text_id
from parakeet.data.batch import batch_spec
from parakeet.data.batch import batch_text_id
class LJSpeech(Dataset):
......
......@@ -11,21 +11,20 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import pickle
import argparse
from pathlib import Path
import tqdm
import numpy as np
import tqdm
from parakeet.audio import AudioProcessor
from parakeet.audio import LogMagnitude
from parakeet.datasets import LJSpeechMetaData
from parakeet.audio import AudioProcessor, LogMagnitude
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.frontend import EnglishCharacter
from config import get_cfg_defaults
def create_dataset(config, source_path, target_path, verbose=False):
# create output dir
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import paddle
import numpy as np
import paddle
from matplotlib import pyplot as plt
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.frontend import EnglishCharacter
from parakeet.models.tacotron2 import Tacotron2
from parakeet.utils import display
from config import get_cfg_defaults
def main(config, args):
paddle.set_device(args.device)
......@@ -36,8 +34,13 @@ def main(config, args):
# inputs
input_path = Path(args.input).expanduser()
sentences = []
with open(input_path, "rt") as f:
sentences = f.readlines()
for line in f:
line_list = line.strip().split()
utt_id = line_list[0]
sentence = " ".join(line_list[1:])
sentences.append((utt_id, sentence))
if args.output is None:
output_dir = input_path.parent / "synthesis"
......
......@@ -11,23 +11,25 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import defaultdict
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle import distributed as dist
from parakeet.data import dataset
from parakeet.exps.tacotron2.config import get_cfg_defaults
from parakeet.exps.tacotron2.ljspeech import LJSpeech
from parakeet.exps.tacotron2.ljspeech import LJSpeechCollector
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.tacotron2 import Tacotron2Loss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechCollector
from parakeet.utils import display
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
......@@ -20,10 +19,11 @@ from pathlib import Path
import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from parakeet.datasets.data_table import DataTable
def main():
"""Run preprocessing process."""
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from concurrent.futures import ThreadPoolExecutor
from operator import itemgetter
......@@ -25,9 +24,10 @@ import librosa
import numpy as np
import tqdm
import yaml
from yacs.config import CfgNode as Configuration
from parakeet.data.get_feats import LogMelFBank
from parakeet.frontend import English
from yacs.config import CfgNode as Configuration
def get_lj_sentences(file_name, frontend):
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -22,6 +21,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.datasets.data_table import DataTable
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSInference
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
......@@ -21,6 +20,7 @@ import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.frontend import English
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSInference
......
......@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import logging
import os
import shutil
from pathlib import Path
......@@ -26,18 +25,19 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.datasets.data_table import DataTable
from visualdl import LogWriter
from yacs.config import CfgNode
from parakeet.datasets.am_batch_fn import transformer_single_spk_batch_fn
from parakeet.datasets.data_table import DataTable
from parakeet.models.transformer_tts import TransformerTTS
from parakeet.models.transformer_tts import TransformerTTSUpdater
from parakeet.models.transformer_tts import TransformerTTSEvaluator
from parakeet.models.transformer_tts import TransformerTTSUpdater
from parakeet.training.extensions.snapshot import Snapshot
from parakeet.training.extensions.visualizer import VisualDL
from parakeet.training.optimizer import build_optimizers
from parakeet.training.seeding import seed_everything
from parakeet.training.trainer import Trainer
from visualdl import LogWriter
from yacs.config import CfgNode
def train_sp(args, config):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,16 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pickle
from pathlib import Path
import numpy as np
from paddle.io import Dataset
from parakeet.frontend import Vocab
from parakeet.data import batch_text_id, batch_spec
from preprocess_transcription import _phones, _tones
from parakeet.data import batch_spec
from parakeet.data import batch_text_id
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _phones
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import _tones
from parakeet.frontend import Vocab
voc_phones = Vocab(sorted(list(_phones)))
print("vocab_phones:\n", voc_phones)
......
......@@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import Tuple
from typing import List, Tuple
from pypinyin import lazy_pinyin, Style
from preprocess_transcription import split_syllable
from pypinyin import lazy_pinyin
from pypinyin import Style
from parakeet.exps.voice_cloning.tacotron2_ge2e.preprocess_transcription import split_syllable
def convert_to_pinyin(text: str) -> List[str]:
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,19 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import multiprocessing as mp
from functools import partial
from pathlib import Path
import numpy as np
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
import tqdm
from config import get_cfg_defaults
from parakeet.audio import AudioProcessor
from parakeet.audio.spec_normalizer import LogMagnitude
from parakeet.audio.spec_normalizer import NormalizerBase
from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
def extract_mel(fname: Path,
......@@ -47,7 +46,7 @@ def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
output_dir.mkdir(parents=True, exist_ok=True)
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
config.hop_length, config.n_mels, config.fmin,
config.hop_length, config.d_mels, config.fmin,
config.fmax)
n = LogMagnitude(1e-5)
......
......@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import re
import pickle
import re
from pathlib import Path
import yaml
import tqdm
import yaml
zh_pattern = re.compile("[\u4e00-\u9fa5]")
......
......@@ -11,17 +11,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from multiprocessing import Pool
from functools import partial
from multiprocessing import Pool
from pathlib import Path
import numpy as np
import librosa
import numpy as np
import soundfile as sf
from tqdm import tqdm
from praatio import tgio
from tqdm import tqdm
def get_valid_part(fpath):
......
......@@ -11,26 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from pathlib import Path
from collections import defaultdict
from pathlib import Path
import numpy as np
from matplotlib import pyplot as plt
import paddle
from matplotlib import pyplot as plt
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.data import dataset
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import AiShell3
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import collate_aishell3_examples
from parakeet.exps.voice_cloning.tacotron2_ge2e.config import get_cfg_defaults
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.tacotron2 import Tacotron2Loss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from parakeet.utils import display, mp_tools
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
from config import get_cfg_defaults
from aishell3 import AiShell3, collate_aishell3_examples
from parakeet.utils import display
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......@@ -192,9 +193,9 @@ class Experiment(ExperimentBase):
def setup_dataloader(self):
args = self.args
config = self.config
ljspeech_dataset = AiShell3(args.data)
aishell3_dataset = AiShell3(args.data)
valid_set, train_set = dataset.split(ljspeech_dataset,
valid_set, train_set = dataset.split(aishell3_dataset,
config.data.valid_size)
batch_fn = collate_aishell3_examples
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
from matplotlib import pyplot as plt
from parakeet.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_phones
from parakeet.exps.voice_cloning.tacotron2_ge2e.aishell3 import voc_tones
from parakeet.exps.voice_cloning.tacotron2_ge2e.chinese_g2p import convert_sentence
from parakeet.models.lstm_speaker_encoder import LSTMSpeakerEncoder
from parakeet.models.tacotron2 import Tacotron2
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.utils import display
def voice_cloning(args):
# speaker encoder
p = SpeakerVerificationPreprocessor(
sampling_rate=16000,
audio_norm_target_dBFS=-30,
vad_window_length=30,
vad_moving_average_width=8,
vad_max_silence_length=6,
mel_window_length=25,
mel_window_step=10,
n_mels=40,
partial_n_frames=160,
min_pad_coverage=0.75,
partial_overlap_ratio=0.5)
print("Audio Processor Done!")
speaker_encoder = LSTMSpeakerEncoder(
n_mels=40, num_layers=3, hidden_size=256, output_size=256)
speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
speaker_encoder.eval()
print("GE2E Done!")
synthesizer = Tacotron2(
vocab_size=68,
n_tones=10,
d_mels=80,
d_encoder=512,
encoder_conv_layers=3,
encoder_kernel_size=5,
d_prenet=256,
d_attention_rnn=1024,
d_decoder_rnn=1024,
attention_filters=32,
attention_kernel_size=31,
d_attention=128,
d_postnet=512,
postnet_kernel_size=5,
postnet_conv_layers=5,
reduction_factor=1,
p_encoder_dropout=0.5,
p_prenet_dropout=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
p_postnet_dropout=0.5,
d_global_condition=256,
use_stop_token=False, )
synthesizer.set_state_dict(paddle.load(args.tacotron2_params_path))
synthesizer.eval()
print("Tacotron2 Done!")
# vocoder
vocoder = ConditionalWaveFlow(
upsample_factors=[16, 16],
n_flows=8,
n_layers=8,
n_group=16,
channels=128,
n_mels=80,
kernel_size=[3, 3])
vocoder.set_state_dict(paddle.load(args.waveflow_params_path))
vocoder.eval()
print("WaveFlow Done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
input_dir = Path(args.input_dir)
# 因为 AISHELL-3 数据集中使用 % 和 $ 表示韵律词和韵律短语的边界,它们大约对应着较短和较长的停顿,在文本中可以使用 % 和 $ 来调节韵律。
# 值得的注意的是,句子的有效字符集仅包含汉字和 %, $, 因此输入的句子只能包含这些字符。
sentence = "每当你觉得%想要批评什么人的时候$你切要记着%这个世界上的人%并非都具备你禀有的条件$"
phones, tones = convert_sentence(sentence)
phones = np.array(
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
tones = np.array([voc_tones.lookup(item) for item in tones], dtype=np.int64)
phones = paddle.to_tensor(phones).unsqueeze(0)
tones = paddle.to_tensor(tones).unsqueeze(0)
for name in os.listdir(input_dir):
utt_id = name.split(".")[0]
ref_audio_path = input_dir / name
mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
print("mel_sequences: ", mel_sequences.shape)
with paddle.no_grad():
embed = speaker_encoder.embed_utterance(
paddle.to_tensor(mel_sequences))
print("embed shape: ", embed.shape)
utterance_embeds = paddle.unsqueeze(embed, 0)
outputs = synthesizer.infer(
phones, tones=tones, global_condition=utterance_embeds)
mel_input = paddle.transpose(outputs["mel_outputs_postnet"], [0, 2, 1])
alignment = outputs["alignments"][0].numpy().T
display.plot_alignment(alignment)
plt.savefig(str(output_dir / (utt_id + ".png")))
with paddle.no_grad():
wav = vocoder.infer(mel_input)
wav = wav.numpy()[0]
sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=22050)
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="")
parser.add_argument(
"--ge2e_params_path", type=str, help="ge2e params path.")
parser.add_argument(
"--tacotron2_params_path", type=str, help="tacotron2 params path.")
parser.add_argument(
"--waveflow_params_path", type=str, help="waveflow params path.")
parser.add_argument(
"--device", type=str, default="gpu", help="device type to use.")
parser.add_argument(
"--input-dir",
type=str,
help="input dir of *.wav, the sample rate will be resample to 16k.")
parser.add_argument("--output-dir", type=str, help="output dir.")
args = parser.parse_args()
paddle.set_device(args.device)
voice_cloning(args)
if __name__ == "__main__":
main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode as CN
_C = CN()
......
......@@ -11,14 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas
from paddle.io import Dataset
from parakeet.data.batch import batch_spec, batch_wav
from parakeet.data.batch import batch_spec
from parakeet.data.batch import batch_wav
class LJSpeech(Dataset):
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import os
from pathlib import Path
import tqdm
import numpy as np
import librosa
import numpy as np
import pandas as pd
import tqdm
from parakeet.datasets import LJSpeechMetaData
from parakeet.audio import LogMagnitude
from config import get_cfg_defaults
from parakeet.datasets import LJSpeechMetaData
from parakeet.exps.waveflow.config import get_cfg_defaults
class Transform(object):
......
......@@ -11,20 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import os
from pathlib import Path
import numpy as np
import soundfile as sf
import paddle
import soundfile as sf
from parakeet.exps.waveflow.config import get_cfg_defaults
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.utils import layer_tools
from config import get_cfg_defaults
def main(config, args):
paddle.set_device(args.device)
......
......@@ -11,22 +11,24 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader, DistributedBatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from parakeet.data import dataset
from parakeet.models.waveflow import ConditionalWaveFlow, WaveFlowLoss
from parakeet.utils import mp_tools
from parakeet.exps.waveflow.config import get_cfg_defaults
from parakeet.exps.waveflow.ljspeech import LJSpeech
from parakeet.exps.waveflow.ljspeech import LJSpeechClipCollector
from parakeet.exps.waveflow.ljspeech import LJSpeechCollector
from parakeet.models.waveflow import ConditionalWaveFlow
from parakeet.models.waveflow import WaveFlowLoss
from parakeet.training.cli import default_argument_parser
from parakeet.training.experiment import ExperimentBase
from config import get_cfg_defaults
from ljspeech import LJSpeech, LJSpeechClipCollector, LJSpeechCollector
from parakeet.utils import mp_tools
class Experiment(ExperimentBase):
......
......@@ -11,11 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .zh_normalization import *
from .generate_lexicon import *
from .normalizer import *
from .phonectic import *
from .punctuation import *
from .tone_sandhi import *
from .vocab import *
from .zh_normalization import *
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.normalizer.normalizer import *
from parakeet.frontend.normalizer.numbers import *
......@@ -17,9 +17,9 @@ from abc import abstractmethod
from g2p_en import G2p
from g2pM import G2pM
from parakeet.frontend.vocab import Vocab
from parakeet.frontend.normalizer.normalizer import normalize
from parakeet.frontend.punctuation import get_punctuations
from parakeet.frontend.vocab import Vocab
# discard opencc untill we find an easy solution to install it on windows
# from opencc import OpenCC
......
......@@ -22,9 +22,9 @@ from g2pM import G2pM
from pypinyin import lazy_pinyin
from pypinyin import Style
from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer
from parakeet.frontend.generate_lexicon import generate_lexicon
from parakeet.frontend.tone_sandhi import ToneSandhi
from parakeet.frontend.zh_normalization.text_normlization import TextNormalizer
class Frontend():
......
......@@ -11,5 +11,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.frontend.zh_normalization.text_normlization import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fastspeech2 import *
from .tacotron2 import *
from .transformer_tts import *
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fastspeech2 import *
from .fastspeech2_updater import *
......@@ -28,10 +28,10 @@ from parakeet.modules.fastspeech2_predictor.variance_predictor import VariancePr
from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncoding
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder as TransformerEncoder
from parakeet.modules.tacotron2.decoder import Postnet
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
from parakeet.modules.tacotron2.decoder import Postnet
class FastSpeech2(nn.Layer):
......
......@@ -14,6 +14,7 @@
import logging
from paddle import distributed as dist
from parakeet.models.fastspeech2 import FastSpeech2Loss
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
......
......@@ -106,10 +106,10 @@ class LSTMSpeakerEncoder(nn.Layer):
def do_gradient_ops(self):
for p in [self.similarity_weight, self.similarity_bias]:
g = p._grad_ivar()
g[...] = g * 0.01
g = g * 0.01
def inv_argmax(self, i, num):
return np.eye(1, num, i, dtype=np.int)[0]
return np.eye(1, num, i, dtype=int)[0]
def loss(self, embeds):
"""
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .parallel_wavegan import *
from .parallel_wavegan_updater import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Dict
......@@ -21,11 +20,12 @@ from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddle.optimizer.lr import LRScheduler
from timer import timer
from parakeet.training.extensions.evaluator import StandardEvaluator
from parakeet.training.reporter import report
from parakeet.training.updaters.standard_updater import StandardUpdater
from parakeet.training.updaters.standard_updater import UpdaterState
from timer import timer
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]')
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .speedyspeech import *
from .speedyspeech_updater import *
......@@ -17,6 +17,7 @@ import paddle
from paddle import distributed as dist
from paddle.fluid.layers import huber_loss
from paddle.nn import functional as F
from parakeet.modules.losses import masked_l1_loss
from parakeet.modules.losses import weighted_mean
from parakeet.modules.ssim import ssim
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .transformer_tts import *
from .transformer_tts_updater import *
......@@ -15,10 +15,11 @@
from typing import Dict
from typing import Sequence
from typing import Tuple
import numpy
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import nn
from typeguard import check_argument_types
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention
......@@ -27,13 +28,13 @@ from parakeet.modules.fastspeech2_transformer.embedding import PositionalEncodin
from parakeet.modules.fastspeech2_transformer.embedding import ScaledPositionalEncoding
from parakeet.modules.fastspeech2_transformer.encoder import Encoder
from parakeet.modules.fastspeech2_transformer.mask import subsequent_mask
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
from parakeet.modules.style_encoder import StyleEncoder
from parakeet.modules.tacotron2.decoder import Postnet
from parakeet.modules.tacotron2.decoder import Prenet as DecoderPrenet
from parakeet.modules.tacotron2.encoder import Encoder as EncoderPrenet
from parakeet.modules.nets_utils import initialize
from parakeet.modules.nets_utils import make_non_pad_mask
from parakeet.modules.nets_utils import make_pad_mask
class TransformerTTS(nn.Layer):
......
......@@ -16,6 +16,7 @@ from typing import Sequence
import paddle
from paddle import distributed as dist
from parakeet.models.transformer_tts import GuidedMultiHeadAttentionLoss
from parakeet.models.transformer_tts import TransformerTTSLoss
from parakeet.training.extensions.evaluator import StandardEvaluator
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .attention import *
from .conv import *
from .geometry import *
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 暂时删除了 dyminic conv
"""Decoder definition."""
import logging
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Decoder self-attention layer definition."""
import paddle
from paddle import nn
from parakeet.modules.layer_norm import LayerNorm
......
......@@ -12,11 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Lightweight Convolution Module."""
import numpy
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle import nn
from parakeet.modules.glu import GLU
from parakeet.modules.masked_fill import masked_fill
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mask module."""
import paddle
......
......@@ -12,12 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Style encoder of GST-Tacotron."""
from typeguard import check_argument_types
from typing import Sequence
import paddle
from paddle import nn
from typeguard import check_argument_types
from parakeet.modules.fastspeech2_transformer.attention import MultiHeadedAttention as BaseMultiHeadedAttention
......
......@@ -12,9 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron2 decoder related modules."""
import six
import paddle.nn.functional as F
import six
from paddle import nn
......
......@@ -12,10 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tacotron2 encoder related modules."""
import six
import paddle
import six
from paddle import nn
......
......@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import nn
from parakeet.modules import attention as attn
from paddle.nn import functional as F
from parakeet.modules import attention as attn
__all__ = [
"PositionwiseFFN",
"TransformerEncoderLayer",
......
......@@ -11,6 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .cli import *
from .experiment import *
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
optim_classes = dict(
......
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import checkpoint
from . import display
from . import layer_tools
......
......@@ -11,8 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
# A global variable to record the number of calling times for profiler
......
......@@ -15,7 +15,6 @@ import contextlib
import inspect
import io
import os
import re
import subprocess as sp
import sys
from pathlib import Path
......@@ -84,7 +83,7 @@ def _post_install(install_lib_dir):
tools_extrs_dir = HERE / 'tools/extras'
with pushd(tools_extrs_dir):
print(os.getcwd())
check_call(f"./install_autolog.sh")
check_call("./install_autolog.sh")
print("autolog install.")
# ctcdecoder
......
......@@ -4,8 +4,8 @@
```
即可运行.
执行逻辑:
1. cd 到 ../../../ (也就是 Parakeet 目录)
1. cd 到 ../../../ (也就是 Deepspeech 目录)
2. 安装 parakeet 所需依赖
3. 从 bos 下载数据集并解压缩
4. 预处理数据集为训练 pwg 所需格式,保存到 Parakeet/dump 文件夹底下
4. 预处理数据集为训练 pwg 所需格式,保存到 Deepspeech/dump 文件夹底下
5. 按照不同的参数执行 run_benchmark.sh 脚本
......@@ -10,6 +10,9 @@ cd ../../../
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
sudo apt-get install libsndfile1
pip install -e .
pushd examples/csmsc/voc1
source path.sh
popd
fi
# 2 拷贝该模型需要数据、预训练模型
# 下载 baker 数据集到 home 目录下并解压缩到 home 目录下
......@@ -22,15 +25,14 @@ fi
# 数据预处理
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
python examples/GANVocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml
python utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
python examples/GANVocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy
python examples/GANVocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy
python examples/GANVocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=examples/csmsc/voc1/conf/default.yaml
python3 utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/train/raw/metadata.jsonl --dumpdir=dump/train/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/dev/raw/metadata.jsonl --dumpdir=dump/dev/norm --stats=dump/train/feats_stats.npy
python3 parakeet/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy
fi
# 3 批量运行(如不方便批量,1,2需放到单个模型中)
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
model_mode_list=(pwg)
fp_item_list=(fp32)
# 满 bs 是 26
......@@ -40,7 +42,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
for bs_item in ${bs_item_list[@]}; do
echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp
CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/PWGAN/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min)
CUDA_VISIBLE_DEVICES=0 bash tests/benchmark/pwgan/run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 100 ${model_mode} # (5min)
sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp
......
......@@ -24,13 +24,13 @@ function _train(){
--max-iter=${max_iter}
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=examples/GANVocoder/parallelwave_gan/baker/conf/default.yaml \
--config=examples/csmsc/voc1/conf/default.yaml \
--output-dir=exp/default \
--run-benchmark=true"
case ${run_mode} in
sp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
mp) train_cmd="python3 examples/GANVocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
sp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=1 ${train_cmd}" ;;
mp) train_cmd="python3 parakeet/exps/gan_vocoder/parallelwave_gan/train.py --nprocs=8 ${train_cmd}"
log_parse_file="mylog/workerlog.0" ;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
......
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=20 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_20.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/mini_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/mini_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --batch_size=32 --max_epoch=10 --num_snapshots=10 --output-dir=exp/default --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=../../examples/speedyspeech/baker/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=../../../examples/csmsc/tts2/conf/default.yaml --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_30.pdz --speedyspeech-stat=train_data/mini_BZNSYP/train/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=train_data/mini_BZNSYP/phone_id_map.txt --tones-dict=train_data/mini_BZNSYP/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
--use_gpu:True
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train:../../examples/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../examples/speedyspeech/baker/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
norm_train:../../../parakeet/exps/speedyspeech/train.py --train-metadata=train_data/processed_BZNSYP/train/norm/metadata.jsonl --dev-metadata=train_data/processed_BZNSYP/dev/norm/metadata.jsonl --config=../../../examples/csmsc/tts2/conf/default.yaml --output-dir=exp/whole --phones-dict=train_data/processed_BZNSYP/phone_id_map.txt --tones-dict=train_data/processed_BZNSYP/tone_id_map.txt --use-relative-path=True
null:null
null:null
null:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval:../../examples/speedyspeech/baker/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../examples/speedyspeech/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
eval:../../../parakeet/exps/speedyspeech/synthesize_e2e.py --speedyspeech-config=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/default.yaml --speedyspeech-checkpoint=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz --speedyspeech-stat=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy --pwg-config=pretrain_models/pwg_baker_ckpt_0.4/pwg_default.yaml --pwg-checkpoint=pretrain_models/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz --pwg-stat=pretrain_models/pwg_baker_ckpt_0.4/pwg_stats.npy --text=../../../parakeet/exps/sentences.txt --output-dir=e2e --inference-dir=inference --device="gpu" --phones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
null:null
##
===========================infer_params===========================
......@@ -37,7 +37,7 @@ null:null
null:null
null:null
null:null
inference:../../examples/speedyspeech/baker/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../examples/speedyspeech/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
inference:../../../parakeet/exps/speedyspeech/inference.py --inference-dir=pretrain_models/speedyspeech_pwg_inference_0.5 --text=../../../parakeet/exps/sentences.txt --output-dir=inference_out --enable-auto-log --phones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/phone_id_map.txt --tones-dict=pretrain_models/speedyspeech_pwg_inference_0.5/tone_id_map.txt
null:null
null:null
null:null
......
文件模式从 100644 更改为 100755
文件模式从 100644 更改为 100755
......@@ -4,7 +4,6 @@
# 2018 Xuankai Chang (Shanghai Jiao Tong University)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import argparse
import json
import logging
import sys
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册