diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml index 3a57e902607f4de641dc36960ad928d590dc5fae..ac4956742ebf568a971c24ac93b6ba0c19765211 100644 --- a/examples/aishell3/tts3/conf/default.yaml +++ b/examples/aishell3/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type @@ -84,7 +84,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml index 557a5a0a1cd04b4cd4f5e2be57e71f42ba457078..ac4956742ebf568a971c24ac93b6ba0c19765211 100644 --- a/examples/aishell3/vc1/conf/default.yaml +++ b/examples/aishell3/vc1/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
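The f0min/f0max comment swap above is documentary only: the values already act as the lower and upper F0 search bounds handed to the pitch extractor. A minimal sketch of that usage with pyworld (pyworld is an assumption here for illustration; PaddleSpeech's own Pitch feature extractor wraps an equivalent call, and fs/n_shift are taken from the csmsc config added later in this diff):

import numpy as np
import pyworld

fs, f0min, f0max = 24000, 80, 400            # f0min/f0max as in the configs above
x = np.random.randn(fs).astype(np.float64)   # one second of dummy audio
# DIO only searches for F0 inside [f0_floor, f0_ceil] == [f0min, f0max]
f0, t = pyworld.dio(x, fs, f0_floor=f0min, f0_ceil=f0max,
                    frame_period=1000 * 300 / fs)   # 12.5 ms hop (n_shift=300)
f0 = pyworld.stonemask(x, f0, t, fs)         # refine the coarse F0 track
print(f0.shape)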
########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 7fbffbdde0164ea2040b5060842c962fb021b679..e2102d6e7e6949812dc69aacc03aa09f5578cc80 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..42635c506ee12faed6facf9ef04fd31f309ff4d7 --- /dev/null +++ b/examples/csmsc/tts0/conf/default.yaml @@ -0,0 +1,91 @@ +# This configuration is for Paddle to train Tacotron 2. Compared to the +# original paper, this configuration additionally use the guided attention +# loss to accelerate the learning of the diagonal attention. It requires +# only a single GPU with 12 GB memory and it takes ~1 days to finish the +# training on Titan V. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 
50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 2 + +########################################################### +# MODEL SETTING # +########################################################### +model: # keyword arguments for the selected model + embed_dim: 512 # char or phn embedding dimension + elayers: 1 # number of blstm layers in encoder + eunits: 512 # number of blstm units + econv_layers: 3 # number of convolutional layers in encoder + econv_chans: 512 # number of channels in convolutional layer + econv_filts: 5 # filter size of convolutional layer + atype: location # attention function type + adim: 512 # attention dimension + aconv_chans: 32 # number of channels in convolutional layer of attention + aconv_filts: 15 # filter size of convolutional layer of attention + cumulate_att_w: True # whether to cumulate attention weight + dlayers: 2 # number of lstm layers in decoder + dunits: 1024 # number of lstm units in decoder + prenet_layers: 2 # number of layers in prenet + prenet_units: 256 # number of units in prenet + postnet_layers: 5 # number of layers in postnet + postnet_chans: 512 # number of channels in postnet + postnet_filts: 5 # filter size of postnet layer + output_activation: null # activation function for the final output + use_batch_norm: True # whether to use batch normalization in encoder + use_concate: True # whether to concatenate encoder embedding with decoder outputs + use_residual: False # whether to use residual connection in encoder + dropout_rate: 0.5 # dropout rate + zoneout_rate: 0.1 # zoneout rate + reduction_factor: 1 # reduction factor + spk_embed_dim: null # speaker embedding dimension + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation + use_guided_attn_loss: True # whether to use guided attention loss + guided_attn_loss_sigma: 0.4 # sigma of guided attention loss + guided_attn_loss_lambda: 1.0 # strength of guided attention loss + + +########################################################## +# OPTIMIZER SETTING # +########################################################## +optimizer: + optim: adam # optimizer type + learning_rate: 1.0e-03 # learning rate + epsilon: 1.0e-06 # epsilon + weight_decay: 0.0 # weight decay coefficient + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 200 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +seed: 42 \ No newline at end of file diff --git a/examples/csmsc/tts0/local/preprocess.sh b/examples/csmsc/tts0/local/preprocess.sh new file mode 100755 index 0000000000000000000000000000000000000000..8a4b8dd94429074e98b7e90a7aea1148269b70ec --- /dev/null +++ b/examples/csmsc/tts0/local/preprocess.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + 
+config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./baker_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/preprocess.py \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --num-cpu=20 \ + --cut-sil=True +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="speech" + +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize and covert phone to id, dev and test should use train's stats + echo "Normalize ..." + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt + + python3 ${BIN_DIR}/normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --speech-stats=dump/train/speech_stats.npy \ + --phones-dict=dump/phone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh new file mode 100755 index 0000000000000000000000000000000000000000..4be06dd80558945d70ddaa65e48a0341bfe4372b --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh new file mode 100755 index 0000000000000000000000000000000000000000..fe5d11d4400876c050fcc4ac48c7dc1286a65778 --- /dev/null +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -0,0 +1,91 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + 
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ + --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ + --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt + # --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/train.sh b/examples/csmsc/tts0/local/train.sh new file mode 100755 index 0000000000000000000000000000000000000000..f90db91505d7ff337824fc716212f566754cb5d8 --- /dev/null +++ b/examples/csmsc/tts0/local/train.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 \ + --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/csmsc/tts0/path.sh b/examples/csmsc/tts0/path.sh new file mode 100755 index 
0000000000000000000000000000000000000000..9cdbe256e11a068a5471fed1ac392804cf615ac1 --- /dev/null +++ b/examples/csmsc/tts0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=new_tacotron2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/csmsc/tts0/run.sh b/examples/csmsc/tts0/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..86800920d68499f8249b66ac51a5ad8d9876bf5d --- /dev/null +++ b/examples/csmsc/tts0/run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml index 252f634d8deff7a14ebf42e284027db78642cc8a..fcad86150a88164c3869c037c4bbf4c520d389c0 100644 --- a/examples/csmsc/tts3/conf/conformer.yaml +++ b/examples/csmsc/tts3/conf/conformer.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
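The examples/csmsc/tts0 recipe added above trains Tacotron 2 with use_guided_attn_loss: True and guided_attn_loss_sigma: 0.4. For intuition, the soft diagonal penalty that loss applies can be written in a few lines; this is the standard formulation only, not the exact PaddleSpeech implementation:

import numpy as np

def guided_attention_weight(text_len, mel_len, sigma=0.4):
    # Penalty grows as the attention point (n / text_len, t / mel_len) drifts
    # off the diagonal; it is multiplied with the attention matrix and averaged.
    t, n = np.meshgrid(np.arange(mel_len), np.arange(text_len), indexing="ij")
    return 1.0 - np.exp(-((n / text_len - t / mel_len) ** 2) / (2 * sigma ** 2))

w = guided_attention_weight(text_len=50, mel_len=400)  # sigma from default.yaml
print(w.shape)  # (400, 50): one penalty weight per (mel frame, input token) pair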
########################################################### @@ -53,8 +53,8 @@ model: conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type conformer_activation_type: swish # conformer activation type - use_macaron_style_in_conformer: true # whether to use macaron style in conformer - use_cnn_in_conformer: true # whether to use CNN in conformer + use_macaron_style_in_conformer: True # whether to use macaron style in conformer + use_cnn_in_conformer: True # whether to use CNN in conformer conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder init_type: xavier_uniform # initialization type @@ -70,14 +70,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml index 1f723d67cd6051ee885cd2f909483d0a2aed6438..2c2a1ea1009a08053a627c38e7b0bb8d904168b6 100644 --- a/examples/csmsc/tts3/conf/default.yaml +++ b/examples/csmsc/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder @@ -82,7 +82,6 @@ updater: use_masking: True # whether to apply masking for padded part in loss calculation - ########################################################### # OPTIMIZER SETTING # ########################################################### diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh index c1ddd3b98629c8b645222db6754bf031b97f3712..8f06e933cccfd77113c4b72956f28ff74aec2037 100755 --- a/examples/csmsc/tts3/run.sh +++ b/examples/csmsc/tts3/run.sh @@ -18,7 +18,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/preprocess.sh ${conf_path} || exit -1 + ./local/preprocess.sh ${conf_path} || exit -1 fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 28d218ff3655dc3a0e74adaf43d4efd2221d9447..703be21b35dc965645f43fc605f29202c97c1c01 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -34,10 +34,10 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - bias: true # use bias in residual blocks - use_weight_norm: true # Whether to use weight norm. + bias: True # use bias in residual blocks + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. - use_causal_conv: false # use causal conv in residual blocks and upsample layers + use_causal_conv: False # use causal conv in residual blocks and upsample layers upsample_scales: [4, 5, 3, 5] # Upsampling scales. Prodcut of these must be the same as hop size. interpolate_mode: "nearest" # upsample net interpolate mode freq_axis_kernel_size: 1 # upsamling net: convolution kernel size in frequencey axis @@ -53,8 +53,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. 
# If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index 27e97664aa2eaf16a1e0b5c154dcf028d58cc4ee..fbff54f193f074402cde9d571aec51f880fc3deb 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss. win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. ########################################################### diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index a3b1d8b113f8c5647d5942b0c270dede1b90593b..0a38c28200e16a7a73c76eb24562b1d6e30c8454 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -63,13 +63,13 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss win_lengths: [600, 1200, 240] # List of window length for STFT-based loss. window: "hann" # Window function for STFT-based loss -use_subband_stft_loss: true +use_subband_stft_loss: True subband_stft_loss_params: fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. hop_sizes: [30, 60, 10] # List of hop size for STFT-based loss. @@ -79,7 +79,7 @@ subband_stft_loss_params: ########################################################### # ADVERSARIAL LOSS SETTING # ########################################################### -use_feat_match_loss: false # Whether to use feature matching loss. +use_feat_match_loss: False # Whether to use feature matching loss. lambda_adv: 2.5 # Loss balancing coefficient for adversarial loss. 
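The recurring true/false to True/False edits in these vocoder configs are stylistic only: PyYAML parses both spellings to the same Python booleans, so model behavior is unchanged. A quick check, assuming PyYAML is installed:

import yaml

print(yaml.safe_load("use_stft_loss: true\nuse_feat_match_loss: False"))
# {'use_stft_loss': True, 'use_feat_match_loss': False}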
########################################################### diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index c9abf78dc220ce0e976baecbb8f6987307fc17c1..cd8f8e2865dc3d0db1c44f974c6057073b12cc38 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -65,7 +65,7 @@ discriminator_params: ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: true +use_stft_loss: True stft_loss_params: fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss. hop_sizes: [120, 240, 50] # List of hop size for STFT-based loss @@ -78,9 +78,9 @@ lambda_aux: 1.0 # Loss balancing coefficient for aux loss. ########################################################### lambda_adv: 1.0 # Loss balancing coefficient for adv loss. generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. ########################################################### # DATA LOADER SETTING # diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index f42fc385acf5bd527d6b2144c39242efe7ed800c..38b94cf5c2416a480ed3d401e804644f0e8008e5 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. 
negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. ########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 73420625111641ffc088ecf34af528c5d13becd3..110ae052bddcfe1c31c9b138f5436533d5d6bb74 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -35,12 +35,12 @@ generator_params: - [1, 3, 5] - [1, 3, 5] - [1, 3, 5] - use_additional_convs: true # Whether to use additional conv layer in residual blocks. - bias: true # Whether to use bias parameter in conv. + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. nonlinear_activation: "leakyrelu" # Nonlinear activation type. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. + use_weight_norm: True # Whether to apply weight normalization. ########################################################### @@ -60,12 +60,12 @@ discriminator_params: channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. - bias: true + bias: True downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: negative_slope: 0.1 - follow_official_norm: true # Whether to follow the official norm setting. + follow_official_norm: True # Whether to follow the official norm setting. periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. 
period_discriminator_params: in_channels: 1 # Number of input channels. @@ -74,19 +74,19 @@ discriminator_params: channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. - bias: true # Whether to use bias parameter in conv layer." + bias: True # Whether to use bias parameter in conv layer." nonlinear_activation: "leakyrelu" # Nonlinear activation. nonlinear_activation_params: # Nonlinear activation paramters. negative_slope: 0.1 - use_weight_norm: true # Whether to apply weight normalization. - use_spectral_norm: false # Whether to apply spectral normalization. + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. ########################################################### # STFT LOSS SETTING # ########################################################### -use_stft_loss: false # Whether to use multi-resolution STFT loss. -use_mel_loss: true # Whether to use Mel-spectrogram loss. +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. mel_loss_params: fs: 24000 fft_size: 2048 @@ -98,14 +98,14 @@ mel_loss_params: fmax: 12000 log_base: null generator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. + average_by_discriminators: False # Whether to average loss by #discriminators. discriminator_adv_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. -use_feat_match_loss: true + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True feat_match_loss_params: - average_by_discriminators: false # Whether to average loss by #discriminators. - average_by_layers: false # Whether to average loss by #layers in each discriminator. - include_final_outputs: false # Whether to include final outputs in feat match loss calculation. + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. 
########################################################### # ADVERSARIAL LOSS SETTING # diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml index 6b495effc8d0f86aa3e872f056eb4cd259eb3327..456b6a1e35353c74b3fd24fc9644a00944d69326 100644 --- a/examples/ljspeech/tts1/conf/default.yaml +++ b/examples/ljspeech/tts1/conf/default.yaml @@ -63,9 +63,9 @@ model: # keyword arguments for the selected model # UPDATER SETTING # ########################################################### updater: - use_masking: true # whether to apply masking for padded part in loss calculation + use_masking: True # whether to apply masking for padded part in loss calculation loss_type: L1 - use_guided_attn_loss: true # whether to use guided attention loss + use_guided_attn_loss: True # whether to use guided attention loss guided_attn_loss_sigma: 0.4 # sigma in guided attention loss guided_attn_loss_lambda: 10.0 # lambda in guided attention loss modules_applied_guided_attn: ["encoder-decoder"] # modules to apply guided attention loss diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml index 872dafcbe35aa10fa72c90f1eead4bbc242e3ebb..5305c912f9127b59713907f408acb15132e20ea2 100644 --- a/examples/ljspeech/tts3/conf/default.yaml +++ b/examples/ljspeech/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml index 2d39beb795dce888cb2fbc295bcdad8b22bd19f6..d30960d657db160b1c6546c596c9475df3107388 100644 --- a/examples/ljspeech/voc1/conf/default.yaml +++ b/examples/ljspeech/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. 
# If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 4, 4, 4] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml index 2738e7c224514ac948107511e4679e93e4f75721..1bca9107b5eda75c17dd33656c092373f0758831 100644 --- a/examples/vctk/tts3/conf/default.yaml +++ b/examples/vctk/tts3/conf/default.yaml @@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis. n_mels: 80 # The number of mel basis. # Only used for the model using pitch features (e.g. FastSpeech2) -f0min: 80 # Maximum f0 for pitch extraction. -f0max: 400 # Minimum f0 for pitch extraction. +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. ########################################################### @@ -64,14 +64,14 @@ model: pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch - stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder energy_predictor_layers: 2 # number of conv layers in energy predictor energy_predictor_chans: 256 # number of channels of conv layers in energy predictor energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor energy_predictor_dropout: 0.5 # dropout rate in energy predictor energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy - stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index 59ce3825dcc55c9a7cc2e9299ebfa4a351189d79..af859d4cccca0f411a433fc4644ef56733020343 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -33,7 +33,7 @@ generator_params: aux_context_window: 2 # Context window size for auxiliary feature. # If set to 2, previous 2 and future 2 frames will be considered. dropout: 0.0 # Dropout rate. 0.0 means no dropout applied. - use_weight_norm: true # Whether to use weight norm. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift @@ -46,8 +46,8 @@ discriminator_params: kernel_size: 3 # Number of output channels. layers: 10 # Number of conv layers. 
conv_channels: 64 # Number of chnn layers. - bias: true # Whether to use bias parameter in conv. - use_weight_norm: true # Whether to use weight norm. + bias: True # Whether to use bias parameter in conv. + use_weight_norm: True # Whether to use weight norm. # If set to true, it will be applied to all of the conv layers. nonlinear_activation: "leakyrelu" # Nonlinear function after each conv. nonlinear_activation_params: # Nonlinear function parameters diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 526871a232d3241806377c16b459cfe42396b4df..2fcb46d9e281ef4ef14d61adbd119e21d9bc94d0 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -17,6 +17,35 @@ import paddle from paddlespeech.t2s.data.batch import batch_sequences +def tacotron2_single_spk_batch_fn(examples): + # fields = ["text", "text_lengths", "speech", "speech_lengths"] + text = [np.array(item["text"], dtype=np.int64) for item in examples] + speech = [np.array(item["speech"], dtype=np.float32) for item in examples] + text_lengths = [ + np.array(item["text_lengths"], dtype=np.int64) for item in examples + ] + speech_lengths = [ + np.array(item["speech_lengths"], dtype=np.int64) for item in examples + ] + + text = batch_sequences(text) + speech = batch_sequences(speech) + + # convert each batch to paddle.Tensor + text = paddle.to_tensor(text) + speech = paddle.to_tensor(speech) + text_lengths = paddle.to_tensor(text_lengths) + speech_lengths = paddle.to_tensor(speech_lengths) + + batch = { + "text": text, + "text_lengths": text_lengths, + "speech": speech, + "speech_lengths": speech_lengths, + } + return batch + + def speedyspeech_single_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] @@ -56,7 +85,7 @@ def speedyspeech_single_spk_batch_fn(examples): def speedyspeech_multi_spk_batch_fn(examples): - # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations", "spk_id"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] feats = [np.array(item["feats"], dtype=np.float32) for item in examples] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 4ddd19f72b4bd52aa1f7f64ae22614e5c4efc5d4..13569b9995f6b723f3bd7f18617622d65bdf04fb 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -15,14 +15,14 @@ # for mb melgan finetune # 长度和原本的 mel 不一致怎么办? 
import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -50,11 +50,14 @@ def evaluate(args, fastspeech2_config): spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None odim = fastspeech2_config.n_mels model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num) + idim=vocab_size, + odim=odim, + **fastspeech2_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) @@ -99,9 +102,15 @@ def evaluate(args, fastspeech2_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,7 +131,8 @@ def evaluate(args, fastspeech2_config): phone_ids = paddle.to_tensor(np.array(phone_ids)) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -143,7 +153,8 @@ def evaluate(args, fastspeech2_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id) + mel = fastspeech2_inference( + phone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -175,12 +186,9 @@ def main(): type=str, default="phone_id_map.txt", help="phone vocabulary file.") - + parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") diff --git a/paddlespeech/t2s/exps/new_tacotron2/__init__.py b/paddlespeech/t2s/exps/new_tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
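The new tacotron2_single_spk_batch_fn added to paddlespeech/t2s/datasets/am_batch_fn.py above collates per-utterance examples into padded paddle Tensors. A minimal sanity-check sketch with dummy data (the shapes used here are illustrative assumptions):

import numpy as np
from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn

examples = [
    {"text": np.array([1, 2, 3]), "text_lengths": 3,
     "speech": np.random.randn(120, 80).astype(np.float32), "speech_lengths": 120},
    {"text": np.array([4, 5]), "text_lengths": 2,
     "speech": np.random.randn(95, 80).astype(np.float32), "speech_lengths": 95},
]
batch = tacotron2_single_spk_batch_fn(examples)
# text is zero-padded to the longest utterance, speech to the longest mel
print(batch["text"].shape, batch["speech"].shape)  # expect [2, 3] and [2, 120, 80]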
diff --git a/paddlespeech/t2s/exps/new_tacotron2/normalize.py b/paddlespeech/t2s/exps/new_tacotron2/normalize.py new file mode 120000 index 0000000000000000000000000000000000000000..64848f899b1c50acf2bc22993e1cbb54eb5e79ca --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/normalize.py @@ -0,0 +1 @@ +../transformer_tts/normalize.py \ No newline at end of file diff --git a/paddlespeech/t2s/exps/new_tacotron2/preprocess.py b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc6b590d3e4b03a5951793b260729a07bff0bc2 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/preprocess.py @@ -0,0 +1,328 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from concurrent.futures import ThreadPoolExecutor +from operator import itemgetter +from pathlib import Path +from typing import Any +from typing import Dict +from typing import List + +import jsonlines +import librosa +import numpy as np +import tqdm +import yaml +from yacs.config import CfgNode + +from paddlespeech.t2s.data.get_feats import LogMelFBank +from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length +from paddlespeech.t2s.datasets.preprocess_utils import get_input_token +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence + + +def process_sentence(config: Dict[str, Any], + fp: Path, + sentences: Dict, + output_dir: Path, + mel_extractor=None, + cut_sil: bool=True, + spk_emb_dir: Path=None): + utt_id = fp.stem + # for vctk + if utt_id.endswith("_mic2"): + utt_id = utt_id[:-5] + record = None + if utt_id in sentences: + # reading, resampling may occur + wav, _ = librosa.load(str(fp), sr=config.fs) + if len(wav.shape) != 1 or np.abs(wav).max() > 1.0: + return record + assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio." + assert np.abs(wav).max( + ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM." 
+ phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant') + # little imprecise than use *.TextGrid directly + times = librosa.frames_to_time( + d_cumsum, sr=config.fs, hop_length=config.n_shift) + if cut_sil: + start = 0 + end = d_cumsum[-1] + if phones[0] == "sil" and len(durations) > 1: + start = times[1] + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + end = times[-2] + durations = durations[:-1] + phones = phones[:-1] + sentences[utt_id][0] = phones + sentences[utt_id][1] = durations + start, end = librosa.time_to_samples([start, end], sr=config.fs) + wav = wav[start:end] + # extract mel feats + logmel = mel_extractor.get_log_mel_fbank(wav) + # change duration according to mel_length + compare_duration_and_mel_length(sentences, utt_id, logmel) + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + num_frames = logmel.shape[0] + assert sum(durations) == num_frames + mel_dir = output_dir / "data_speech" + mel_dir.mkdir(parents=True, exist_ok=True) + mel_path = mel_dir / (utt_id + "_speech.npy") + np.save(mel_path, logmel) + record = { + "utt_id": utt_id, + "phones": phones, + "text_lengths": len(phones), + "speech_lengths": num_frames, + "speech": str(mel_path), + "speaker": speaker + } + if spk_emb_dir: + if speaker in os.listdir(spk_emb_dir): + embed_name = utt_id + ".npy" + embed_path = spk_emb_dir / speaker / embed_name + if embed_path.is_file(): + record["spk_emb"] = str(embed_path) + else: + return None + return record + + +def process_sentences(config, + fps: List[Path], + sentences: Dict, + output_dir: Path, + mel_extractor=None, + nprocs: int=1, + cut_sil: bool=True, + spk_emb_dir: Path=None): + if nprocs == 1: + results = [] + for fp in fps: + record = process_sentence(config, fp, sentences, output_dir, + mel_extractor, cut_sil, spk_emb_dir) + if record: + results.append(record) + else: + with ThreadPoolExecutor(nprocs) as pool: + futures = [] + with tqdm.tqdm(total=len(fps)) as progress: + for fp in fps: + future = pool.submit(process_sentence, config, fp, + sentences, output_dir, mel_extractor, + cut_sil, spk_emb_dir) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + results = [] + for ft in futures: + record = ft.result() + if record: + results.append(record) + + results.sort(key=itemgetter("utt_id")) + with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: + for item in results: + writer.write(item) + print("Done") + + +def main(): + # parse config and args + parser = argparse.ArgumentParser( + description="Preprocess audio and then extract features.") + + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now") + + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + + parser.add_argument( + "--dumpdir", + type=str, + required=True, + help="directory to dump feature files.") + parser.add_argument( + "--dur-file", default=None, type=str, help="path to durations.txt.") + + parser.add_argument("--config", type=str, help="fastspeech2 config file.") + + parser.add_argument( + "--verbose", + type=int, + default=1, + help="logging level. higher is more logging. 
(default=1)") + parser.add_argument( + "--num-cpu", type=int, default=1, help="number of process.") + + def str2bool(str): + return True if str.lower() == 'true' else False + + parser.add_argument( + "--cut-sil", + type=str2bool, + default=True, + help="whether cut sil in the edge of audio") + + parser.add_argument( + "--spk_emb_dir", + default=None, + type=str, + help="directory to speaker embedding files.") + args = parser.parse_args() + + rootdir = Path(args.rootdir).expanduser() + dumpdir = Path(args.dumpdir).expanduser() + # use absolute path + dumpdir = dumpdir.resolve() + dumpdir.mkdir(parents=True, exist_ok=True) + dur_file = Path(args.dur_file).expanduser() + + if args.spk_emb_dir: + spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve() + else: + spk_emb_dir = None + + assert rootdir.is_dir() + assert dur_file.is_file() + + with open(args.config, 'rt') as f: + config = CfgNode(yaml.safe_load(f)) + + if args.verbose > 1: + print(vars(args)) + print(config) + + sentences, speaker_set = get_phn_dur(dur_file) + + merge_silence(sentences) + phone_id_map_path = dumpdir / "phone_id_map.txt" + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_input_token(sentences, phone_id_map_path, args.dataset) + get_spk_id_map(speaker_set, speaker_id_map_path) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + elif args.dataset == "ljspeech": + wav_files = sorted(list((rootdir / "wavs").rglob("*.wav"))) + # split data into 3 sections + num_train = 12900 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "vctk": + sub_num_dev = 5 + wav_dir = rootdir / "wav48_silence_trimmed" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + else: + print("dataset should in {baker, aishell3, ljspeech, vctk} now!") + + train_dump_dir = dumpdir / "train" / "raw" + train_dump_dir.mkdir(parents=True, exist_ok=True) + dev_dump_dir = dumpdir / "dev" / "raw" + dev_dump_dir.mkdir(parents=True, exist_ok=True) + test_dump_dir = dumpdir / "test" / "raw" + test_dump_dir.mkdir(parents=True, exist_ok=True) + + # Extractor + mel_extractor = LogMelFBank( + sr=config.fs, + n_fft=config.n_fft, + hop_length=config.n_shift, + win_length=config.win_length, + window=config.window, + n_mels=config.n_mels, + fmin=config.fmin, + fmax=config.fmax) + + # process for the 3 sections + if train_wav_files: + process_sentences( 
+ config, + train_wav_files, + sentences, + train_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if dev_wav_files: + process_sentences( + config, + dev_wav_files, + sentences, + dev_dump_dir, + mel_extractor, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + if test_wav_files: + process_sentences( + config, + test_wav_files, + sentences, + test_dump_dir, + mel_extractor, + nprocs=args.num_cpu, + cut_sil=args.cut_sil, + spk_emb_dir=spk_emb_dir) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/new_tacotron2/train.py b/paddlespeech/t2s/exps/new_tacotron2/train.py new file mode 100644 index 0000000000000000000000000000000000000000..20f73f0cedfc7cbea44f952c703434f4e3e0cd60 --- /dev/null +++ b/paddlespeech/t2s/exps/new_tacotron2/train.py @@ -0,0 +1,190 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import os +import shutil +from pathlib import Path + +import jsonlines +import numpy as np +import paddle +import yaml +from paddle import DataParallel +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.am_batch_fn import tacotron2_single_spk_batch_fn +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2 +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Evaluator +from paddlespeech.t2s.models.new_tacotron2 import Tacotron2Updater +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.optimizer import build_optimizers +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer + + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") + else: + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + + # construct dataset for training and validation + with jsonlines.open(args.train_metadata, 'r') as reader: + train_metadata = list(reader) + train_dataset = DataTable( + data=train_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + with jsonlines.open(args.dev_metadata, 'r') as reader: + dev_metadata = list(reader) + dev_dataset = DataTable( + 
data=dev_metadata, + fields=[ + "text", + "text_lengths", + "speech", + "speech_lengths", + ], + converters={ + "speech": np.load, + }, ) + + # collate function and dataloader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=True) + + print("samplers done!") + + train_dataloader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=tacotron2_single_spk_batch_fn, + num_workers=config.num_workers) + + dev_dataloader = DataLoader( + dev_dataset, + shuffle=False, + drop_last=False, + batch_size=config.batch_size, + collate_fn=tacotron2_single_spk_batch_fn, + num_workers=config.num_workers) + print("dataloaders done!") + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + odim = config.n_mels + model = Tacotron2(idim=vocab_size, odim=odim, **config["model"]) + if world_size > 1: + model = DataParallel(model) + print("model done!") + + optimizer = build_optimizers(model, **config["optimizer"]) + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + shutil.copyfile(args.config, output_dir / config_name) + + updater = Tacotron2Updater( + model=model, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir, + **config["updater"]) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = Tacotron2Evaluator( + model, dev_dataloader, output_dir=output_dir, **config["updater"]) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + # print(trainer.extensions) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a Tacotron2 model.") + parser.add_argument("--config", type=str, help="tacotron2 config file.") + parser.add_argument("--train-metadata", type=str, help="training data.") + parser.add_argument("--dev-metadata", type=str, help="dev data.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--phones-dict", type=str, default=None, help="phone vocabulary file.") + + args = parser.parse_args() + + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index f54774704a86d34cf00a8d01ac827ad4bfc84d80..02bfcb15d6f4b6e028314d24c21483ee53f5b183 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -36,6 +36,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + 
"paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -91,6 +95,8 @@ def evaluate(args): print("spk_num:", spk_num) elif am_name == 'speedyspeech': fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] test_dataset = DataTable(data=test_metadata, fields=fields) @@ -117,6 +123,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -168,6 +176,9 @@ def evaluate(args): phone_ids = paddle.to_tensor(datum["phones"]) tone_ids = paddle.to_tensor(datum["tones"]) mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + mel = am_inference(phone_ids) # vocoder wav = voc_inference(mel) sf.write( @@ -188,7 +199,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 9b503213a9b8d23d1ccdaf6849b5cc2c3b2ff96a..9f58579febf206e8ce3ecbeffb8d60ffa411c1bd 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -38,6 +38,10 @@ model_alias = { "paddlespeech.t2s.models.fastspeech2:FastSpeech2", "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.new_tacotron2:Tacotron2Inference", # voc "pwgan": "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", @@ -126,6 +130,8 @@ def evaluate(args): elif am_name == 'speedyspeech': am = am_class( vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) am.eval() @@ -237,6 +243,8 @@ def evaluate(args): elif am_name == 'speedyspeech': part_tone_ids = tone_ids[i] mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': + mel = am_inference(part_phone_ids) # vocoder wav = voc_inference(mel) if flags == 0: @@ -262,7 +270,7 @@ def main(): default='fastspeech2_csmsc', choices=[ 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech', - 'fastspeech2_aishell3', 'fastspeech2_vctk' + 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc' ], help='Choose acoustic model type of tts task.') parser.add_argument( diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py index f268a4e3359ecbee3a3b478b7cb94c31b145487e..65227374ed7550a219665aa559611747a3bc7f8c 100644 --- a/paddlespeech/t2s/models/__init__.py +++ b/paddlespeech/t2s/models/__init__.py @@ -14,6 +14,7 @@ from .fastspeech2 import * from .hifigan import * from .melgan import * +from .new_tacotron2 import * from .parallel_wavegan import * from .speedyspeech import * from .tacotron2 import * diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py 
b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index dc136ffdad3ecac51e385f678b9c551703766f49..3e952c2043af4aded6e41738a2b94c8ed664cbf5 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -556,8 +556,7 @@ class FastSpeech2(nn.Layer): tone_id=tone_id) # modify mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py index 0dabf934ceb56b71f950b5179ba4c5e065faf499..92aa9dfc7730beb377aa36333a55b9133f378b0f 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +from pathlib import Path from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator @@ -28,20 +32,17 @@ logger.setLevel(logging.INFO) class FastSpeech2Updater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -107,14 +108,12 @@ class FastSpeech2Updater(StandardUpdater): class FastSpeech2Evaluator(StandardEvaluator): def __init__(self, - model, - dataloader, - use_masking=False, - use_weighted_masking=False, - output_dir=None): + model: Layer, + dataloader: DataLoader, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -123,8 +122,7 @@ class FastSpeech2Evaluator(StandardEvaluator): self.msg = "" self.criterion = FastSpeech2Loss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking) + use_masking=use_masking, use_weighted_masking=use_weighted_masking) def evaluate_core(self, batch): self.msg = "Evaluate: " diff --git a/paddlespeech/t2s/models/new_tacotron2/__init__.py b/paddlespeech/t2s/models/new_tacotron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea63257c80d10cf16f34b027ad190edc15bfc815 --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .tacotron2 import * +from .tacotron2_updater import * diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py new file mode 100644 index 0000000000000000000000000000000000000000..6a6d107356c43fd0ba1e7568287e9cfd4e9e0a03 --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2.py @@ -0,0 +1,500 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tacotron 2 related modules for paddle""" +import logging +from typing import Dict +from typing import Optional +from typing import Tuple + +import paddle +import paddle.nn.functional as F +from paddle import nn +from typeguard import check_argument_types + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.nets_utils import make_pad_mask +from paddlespeech.t2s.modules.tacotron2.attentions import AttForward +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA +from paddlespeech.t2s.modules.tacotron2.attentions import AttLoc +from paddlespeech.t2s.modules.tacotron2.decoder import Decoder +from paddlespeech.t2s.modules.tacotron2.encoder import Encoder + + +class Tacotron2(nn.Layer): + """Tacotron2 module for end-to-end text-to-speech. + + This is a module of Spectrogram prediction network in Tacotron2 described + in `Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_, + which converts the sequence of characters into the sequence of Mel-filterbanks. + + .. 
_`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + + """ + + def __init__( + self, + # network structure related + idim: int, + odim: int, + embed_dim: int=512, + elayers: int=1, + eunits: int=512, + econv_layers: int=3, + econv_chans: int=512, + econv_filts: int=5, + atype: str="location", + adim: int=512, + aconv_chans: int=32, + aconv_filts: int=15, + cumulate_att_w: bool=True, + dlayers: int=2, + dunits: int=1024, + prenet_layers: int=2, + prenet_units: int=256, + postnet_layers: int=5, + postnet_chans: int=512, + postnet_filts: int=5, + output_activation: str=None, + use_batch_norm: bool=True, + use_concate: bool=True, + use_residual: bool=False, + reduction_factor: int=1, + # extra embedding related + spk_num: Optional[int]=None, + lang_num: Optional[int]=None, + spk_embed_dim: Optional[int]=None, + spk_embed_integration_type: str="concat", + dropout_rate: float=0.5, + zoneout_rate: float=0.1, + # training related + init_type: str="xavier_uniform", ): + """Initialize Tacotron2 module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + embed_dim : int + Dimension of the token embedding. + elayers : int + Number of encoder blstm layers. + eunits : int + Number of encoder blstm units. + econv_layers : int + Number of encoder conv layers. + econv_filts : int + Number of encoder conv filter size. + econv_chans : int + Number of encoder conv filter channels. + dlayers : int + Number of decoder lstm layers. + dunits : int + Number of decoder lstm units. + prenet_layers : int + Number of prenet layers. + prenet_units : int + Number of prenet units. + postnet_layers : int + Number of postnet layers. + postnet_filts : int + Number of postnet filter size. + postnet_chans : int + Number of postnet filter channels. + output_activation : str + Name of activation function for outputs. + adim : int + Number of dimension of mlp in attention. + aconv_chans : int + Number of attention conv filter channels. + aconv_filts : int + Number of attention conv filter size. + cumulate_att_w : bool + Whether to cumulate previous attention weight. + use_batch_norm : bool + Whether to use batch normalization. + use_concate : bool + Whether to concat enc outputs w/ dec lstm outputs. + reduction_factor : int + Reduction factor. + spk_num : Optional[int] + Number of speakers. If set to > 1, assume that the + sids will be provided as the input and use sid embedding layer. + lang_num : Optional[int] + Number of languages. If set to > 1, assume that the + lids will be provided as the input and use sid embedding layer. + spk_embed_dim : Optional[int] + Speaker embedding dimension. If set to > 0, + assume that spk_emb will be provided as the input. + spk_embed_integration_type : str + How to integrate speaker embedding. + dropout_rate : float + Dropout rate. + zoneout_rate : float + Zoneout rate. + """ + assert check_argument_types() + super().__init__() + + # store hyperparameters + self.idim = idim + self.odim = odim + self.eos = idim - 1 + self.cumulate_att_w = cumulate_att_w + self.reduction_factor = reduction_factor + + # define activation function for the final output + if output_activation is None: + self.output_activation_fn = None + elif hasattr(F, output_activation): + self.output_activation_fn = getattr(F, output_activation) + else: + raise ValueError(f"there is no such an activation function. 
" + f"({output_activation})") + + # set padding idx + padding_idx = 0 + self.padding_idx = padding_idx + + # initialize parameters + initialize(self, init_type) + + # define network modules + self.enc = Encoder( + idim=idim, + embed_dim=embed_dim, + elayers=elayers, + eunits=eunits, + econv_layers=econv_layers, + econv_chans=econv_chans, + econv_filts=econv_filts, + use_batch_norm=use_batch_norm, + use_residual=use_residual, + dropout_rate=dropout_rate, + padding_idx=padding_idx, ) + + self.spk_num = None + if spk_num is not None and spk_num > 1: + self.spk_num = spk_num + self.sid_emb = nn.Embedding(spk_num, eunits) + self.lang_num = None + if lang_num is not None and lang_num > 1: + self.lang_num = lang_num + self.lid_emb = nn.Embedding(lang_num, eunits) + + self.spk_embed_dim = None + if spk_embed_dim is not None and spk_embed_dim > 0: + self.spk_embed_dim = spk_embed_dim + self.spk_embed_integration_type = spk_embed_integration_type + if self.spk_embed_dim is None: + dec_idim = eunits + elif self.spk_embed_integration_type == "concat": + dec_idim = eunits + spk_embed_dim + elif self.spk_embed_integration_type == "add": + dec_idim = eunits + self.projection = nn.Linear(self.spk_embed_dim, eunits) + else: + raise ValueError(f"{spk_embed_integration_type} is not supported.") + + if atype == "location": + att = AttLoc(dec_idim, dunits, adim, aconv_chans, aconv_filts) + elif atype == "forward": + att = AttForward(dec_idim, dunits, adim, aconv_chans, aconv_filts) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + elif atype == "forward_ta": + att = AttForwardTA(dec_idim, dunits, adim, aconv_chans, aconv_filts, + odim) + if self.cumulate_att_w: + logging.warning("cumulation of attention weights is disabled " + "in forward attention.") + self.cumulate_att_w = False + else: + raise NotImplementedError("Support only location or forward") + self.dec = Decoder( + idim=dec_idim, + odim=odim, + att=att, + dlayers=dlayers, + dunits=dunits, + prenet_layers=prenet_layers, + prenet_units=prenet_units, + postnet_layers=postnet_layers, + postnet_chans=postnet_chans, + postnet_filts=postnet_filts, + output_activation_fn=self.output_activation_fn, + cumulate_att_w=self.cumulate_att_w, + use_batch_norm=use_batch_norm, + use_concate=use_concate, + dropout_rate=dropout_rate, + zoneout_rate=zoneout_rate, + reduction_factor=reduction_factor, ) + + nn.initializer.set_global_initializer(None) + + def forward( + self, + text: paddle.Tensor, + text_lengths: paddle.Tensor, + speech: paddle.Tensor, + speech_lengths: paddle.Tensor, + spk_emb: Optional[paddle.Tensor]=None, + spk_id: Optional[paddle.Tensor]=None, + lang_id: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]: + """Calculate forward propagation. + + Parameters + ---------- + text : Tensor(int64) + Batch of padded character ids (B, T_text). + text_lengths : Tensor(int64) + Batch of lengths of each input batch (B,). + speech : Tensor + Batch of padded target features (B, T_feats, odim). + speech_lengths : Tensor(int64) + Batch of the lengths of each target (B,). + spk_emb : Optional[Tensor] + Batch of speaker embeddings (B, spk_embed_dim). + spk_id : Optional[Tensor] + Batch of speaker IDs (B, 1). + lang_id : Optional[Tensor] + Batch of language IDs (B, 1). + + Returns + ---------- + Tensor + Loss scalar value. + Dict + Statistics to be monitored. + Tensor + Weight value if not joint training else model outputs. 
+
+        """
+        text = text[:, :text_lengths.max()]
+        speech = speech[:, :speech_lengths.max()]
+
+        batch_size = paddle.shape(text)[0]
+
+        # Add eos at the end of the sequence
+        xs = F.pad(text, [0, 0, 0, 1], "constant", self.padding_idx)
+        for i, l in enumerate(text_lengths):
+            xs[i, l] = self.eos
+        ilens = text_lengths + 1
+
+        ys = speech
+        olens = speech_lengths
+
+        # make labels for stop prediction
+        stop_labels = make_pad_mask(olens - 1)
+        # bool tensors cannot be sliced, so cast to float32 first
+        stop_labels = paddle.cast(stop_labels, dtype='float32')
+        stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0)
+
+        # calculate tacotron2 outputs
+        after_outs, before_outs, logits, att_ws = self._forward(
+            xs=xs,
+            ilens=ilens,
+            ys=ys,
+            olens=olens,
+            spk_emb=spk_emb,
+            spk_id=spk_id,
+            lang_id=lang_id, )
+
+        # trim the targets to a multiple of the reduction factor
+        if self.reduction_factor > 1:
+            assert olens.ge(self.reduction_factor).all(
+            ), "Output length must be greater than or equal to reduction factor."
+            olens = olens - olens % self.reduction_factor
+            max_out = max(olens)
+            ys = ys[:, :max_out]
+            stop_labels = stop_labels[:, :max_out]
+            stop_labels = paddle.scatter(stop_labels, 1,
+                                         (olens - 1).unsqueeze(1), 1.0)
+            olens_in = olens // self.reduction_factor
+        else:
+            olens_in = olens
+        return after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in
+
+    def _forward(
+            self,
+            xs: paddle.Tensor,
+            ilens: paddle.Tensor,
+            ys: paddle.Tensor,
+            olens: paddle.Tensor,
+            spk_emb: paddle.Tensor,
+            spk_id: paddle.Tensor,
+            lang_id: paddle.Tensor,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+
+        hs, hlens = self.enc(xs, ilens)
+        if self.spk_num is not None:
+            sid_embs = self.sid_emb(spk_id.reshape([-1]))
+            hs = hs + sid_embs.unsqueeze(1)
+        if self.lang_num is not None:
+            lid_embs = self.lid_emb(lang_id.reshape([-1]))
+            hs = hs + lid_embs.unsqueeze(1)
+        if self.spk_embed_dim is not None:
+            hs = self._integrate_with_spk_embed(hs, spk_emb)
+
+        return self.dec(hs, hlens, ys)
+
+    def inference(
+            self,
+            text: paddle.Tensor,
+            speech: Optional[paddle.Tensor]=None,
+            spk_emb: Optional[paddle.Tensor]=None,
+            spk_id: Optional[paddle.Tensor]=None,
+            lang_id: Optional[paddle.Tensor]=None,
+            threshold: float=0.5,
+            minlenratio: float=0.0,
+            maxlenratio: float=10.0,
+            use_att_constraint: bool=False,
+            backward_window: int=1,
+            forward_window: int=3,
+            use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
+        """Generate the sequence of features given the sequences of characters.
+
+        Parameters
+        ----------
+        text : Tensor(int64)
+            Input sequence of characters (T_text,).
+        speech : Optional[Tensor]
+            Feature sequence to extract style (N, idim).
+        spk_emb : Optional[Tensor]
+            Speaker embedding (spk_embed_dim,).
+        spk_id : Optional[Tensor]
+            Speaker ID (1,).
+        lang_id : Optional[Tensor]
+            Language ID (1,).
+        threshold : float
+            Threshold in inference.
+        minlenratio : float
+            Minimum length ratio in inference.
+        maxlenratio : float
+            Maximum length ratio in inference.
+        use_att_constraint : bool
+            Whether to apply attention constraint.
+        backward_window : int
+            Backward window in attention constraint.
+        forward_window : int
+            Forward window in attention constraint.
+        use_teacher_forcing : bool
+            Whether to use teacher forcing.
+
+        Returns
+        ----------
+        Dict[str, Tensor]
+            Output dict including the following items:
+                * feat_gen (Tensor): Output sequence of features (T_feats, odim).
+                * prob (Tensor): Output sequence of stop probabilities (T_feats,).
+                * att_w (Tensor): Attention weights (T_feats, T).
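+
+        Note
+        ----------
+            When ``use_teacher_forcing`` is True, ``speech`` must be provided
+            and the returned dict contains only ``feat_gen`` and ``att_w``.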
+ + """ + x = text + y = speech + + # add eos at the last of sequence + x = F.pad(x, [0, 1], "constant", self.eos) + + # inference with teacher forcing + if use_teacher_forcing: + assert speech is not None, "speech must be provided with teacher forcing." + + xs, ys = x.unsqueeze(0), y.unsqueeze(0) + spk_emb = None if spk_emb is None else spk_emb.unsqueeze(0) + ilens = paddle.shape(xs)[1] + olens = paddle.shape(ys)[1] + outs, _, _, att_ws = self._forward( + xs=xs, + ilens=ilens, + ys=ys, + olens=olens, + spk_emb=spk_emb, + spk_id=spk_id, + lang_id=lang_id, ) + + return dict(feat_gen=outs[0], att_w=att_ws[0]) + + # inference + h = self.enc.inference(x) + if self.spk_num is not None: + sid_emb = self.sid_emb(spk_id.reshape([-1])) + h = h + sid_emb + if self.lang_num is not None: + lid_emb = self.lid_emb(lang_id.reshape([-1])) + h = h + lid_emb + if self.spk_embed_dim is not None: + hs, spk_emb = h.unsqueeze(0), spk_emb.unsqueeze(0) + h = self._integrate_with_spk_embed(hs, spk_emb)[0] + out, prob, att_w = self.dec.inference( + h, + threshold=threshold, + minlenratio=minlenratio, + maxlenratio=maxlenratio, + use_att_constraint=use_att_constraint, + backward_window=backward_window, + forward_window=forward_window, ) + + return dict(feat_gen=out, prob=prob, att_w=att_w) + + def _integrate_with_spk_embed(self, + hs: paddle.Tensor, + spk_emb: paddle.Tensor) -> paddle.Tensor: + """Integrate speaker embedding with hidden states. + + Parameters + ---------- + hs : Tensor + Batch of hidden state sequences (B, Tmax, eunits). + spk_emb : Tensor + Batch of speaker embeddings (B, spk_embed_dim). + + Returns + ---------- + Tensor + Batch of integrated hidden state sequences (B, Tmax, eunits) if + integration_type is "add" else (B, Tmax, eunits + spk_embed_dim). + + """ + if self.spk_embed_integration_type == "add": + # apply projection and then add to hidden states + spk_emb = self.projection(F.normalize(spk_emb)) + hs = hs + spk_emb.unsqueeze(1) + elif self.spk_embed_integration_type == "concat": + # concat hidden states with spk embeds + spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( + -1, paddle.shape(hs)[1], -1) + hs = paddle.concat([hs, spk_emb], axis=-1) + else: + raise NotImplementedError("support only add or concat.") + + return hs + + +class Tacotron2Inference(nn.Layer): + def __init__(self, normalizer, model): + super().__init__() + self.normalizer = normalizer + self.acoustic_model = model + + def forward(self, text, spk_id=None, spk_emb=None): + out = self.acoustic_model.inference( + text, spk_id=spk_id, spk_emb=spk_emb) + normalized_mel = out["feat_gen"] + logmel = self.normalizer.inverse(normalized_mel) + return logmel diff --git a/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..09e6827d04ec97da2d37cc1393480fda3abc234e --- /dev/null +++ b/paddlespeech/t2s/models/new_tacotron2/tacotron2_updater.py @@ -0,0 +1,219 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from pathlib import Path + +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer + +from paddlespeech.t2s.modules.losses import GuidedAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class Tacotron2Updater(StandardUpdater): + def __init__(self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir: Path=None): + super().__init__(model, optimizer, dataloader, init_state=None) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) + loss = loss + attn_loss + + optimizer = self.optimizer + optimizer.clear_grad() + loss.backward() + optimizer.step() + + report("train/l1_loss", float(l1_loss)) + report("train/mse_loss", float(mse_loss)) + report("train/bce_loss", float(bce_loss)) + report("train/attn_loss", float(attn_loss)) + report("train/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = 
float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class Tacotron2Evaluator(StandardEvaluator): + def __init__(self, + model: Layer, + dataloader: DataLoader, + use_masking: bool=True, + use_weighted_masking: bool=False, + bce_pos_weight: float=5.0, + loss_type: str="L1+L2", + use_guided_attn_loss: bool=True, + guided_attn_loss_sigma: float=0.4, + guided_attn_loss_lambda: float=1.0, + output_dir=None): + super().__init__(model, dataloader) + + self.loss_type = loss_type + self.use_guided_attn_loss = use_guided_attn_loss + + self.taco2_loss = Tacotron2Loss( + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight, ) + if self.use_guided_attn_loss: + self.attn_loss = GuidedAttentionLoss( + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + # spk_id!=None in multiple spk fastspeech2 + spk_id = batch["spk_id"] if "spk_id" in batch else None + spk_emb = batch["spk_emb"] if "spk_emb" in batch else None + if spk_emb is not None: + spk_id = None + + after_outs, before_outs, logits, ys, stop_labels, olens, att_ws, olens_in = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + spk_id=spk_id, + spk_emb=spk_emb) + + # calculate taco2 loss + l1_loss, mse_loss, bce_loss = self.taco2_loss( + after_outs=after_outs, + before_outs=before_outs, + logits=logits, + ys=ys, + stop_labels=stop_labels, + olens=olens) + + if self.loss_type == "L1+L2": + loss = l1_loss + mse_loss + bce_loss + elif self.loss_type == "L1": + loss = l1_loss + bce_loss + elif self.loss_type == "L2": + loss = mse_loss + bce_loss + else: + raise ValueError(f"unknown --loss-type {self.loss_type}") + + # calculate attention loss + if self.use_guided_attn_loss: + # NOTE: length of output for auto-regressive + # input will be changed when r > 1 + attn_loss = self.attn_loss( + att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) + loss = loss + attn_loss + + report("eval/l1_loss", float(l1_loss)) + report("eval/mse_loss", float(mse_loss)) + report("eval/bce_loss", float(bce_loss)) + report("eval/attn_loss", float(attn_loss)) + report("eval/loss", float(loss)) + + losses_dict["l1_loss"] = float(l1_loss) + losses_dict["mse_loss"] = float(mse_loss) + losses_dict["bce_loss"] = float(bce_loss) + losses_dict["attn_loss"] = float(attn_loss) + losses_dict["loss"] = float(loss) + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index ee45cdc85dc6cd0c078f6699468aaa442c79a38d..e30a3fe1a5947c7046501ef26fe069656c1fcb31 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +from pathlib import Path import paddle from paddle import distributed as dist from paddle.fluid.layers import huber_loss +from paddle.io import DataLoader from paddle.nn import functional as F +from paddle.nn import Layer +from paddle.optimizer import Optimizer from paddlespeech.t2s.modules.losses import masked_l1_loss from paddlespeech.t2s.modules.losses import ssim @@ -33,11 +37,11 @@ logger.setLevel(logging.INFO) class SpeedySpeechUpdater(StandardUpdater): def __init__(self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - output_dir=None): + output_dir: Path=None): super().__init__(model, optimizer, dataloader, init_state=None) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) @@ -103,7 +107,10 @@ class SpeedySpeechUpdater(StandardUpdater): class SpeedySpeechEvaluator(StandardEvaluator): - def __init__(self, model, dataloader, output_dir=None): + def __init__(self, + model: Layer, + dataloader: DataLoader, + output_dir: Path=None): super().__init__(model, dataloader) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py index ae6d736559384b8951f02c09dbc8bf987625e1f5..4babe283623faa0c62902ddb629c684a529111ea 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py @@ -433,12 +433,10 @@ class TransformerTTS(nn.Layer): olens = paddle.cast(speech_lengths, 'int64') # make labels for stop prediction - labels = make_pad_mask(olens - 1) - labels = numpy.pad( - labels.numpy(), ((0, 0), (0, 1)), 'constant', constant_values=1.0) - labels = paddle.to_tensor(labels) - labels = paddle.cast(labels, dtype="float32") - # labels = F.pad(labels, [0, 1], "constant", 1.0) + stop_labels = make_pad_mask(olens - 1) + # bool 类型无法切片 + stop_labels = paddle.cast(stop_labels, dtype='float32') + stop_labels = F.pad(stop_labels, [0, 0, 0, 1], "constant", 1.0) # calculate transformer outputs after_outs, before_outs, logits = self._forward(xs, ilens, ys, olens, @@ -447,12 +445,15 @@ class TransformerTTS(nn.Layer): # modifiy mod part of groundtruth if self.reduction_factor > 1: - olens = paddle.to_tensor( - [olen - olen % self.reduction_factor for olen in olens.numpy()]) + olens = olens - olens % self.reduction_factor max_olen = max(olens) ys = ys[:, :max_olen] - labels = labels[:, :max_olen] - labels[:, -1] = 1.0 # make sure at least one frame has 1 + stop_labels = stop_labels[:, :max_olen] + stop_labels[:, -1] = 1.0 # make sure at least one frame has 1 + olens_in = olens // self.reduction_factor + else: + olens_in = olens + need_dict = {} need_dict['encoder'] = self.encoder need_dict['decoder'] = self.decoder @@ -462,7 +463,7 @@ class TransformerTTS(nn.Layer): 'num_layers_applied_guided_attn'] = self.num_layers_applied_guided_attn need_dict['use_scaled_pos_enc'] = self.use_scaled_pos_enc - return after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict + return after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict def _forward( self, @@ -488,8 +489,7 @@ class TransformerTTS(nn.Layer): # thin out frames for reduction factor (B, Lmax, odim) -> (B, Lmax//r, odim) if self.reduction_factor > 1: ys_in = ys[:, self.reduction_factor - 1::self.reduction_factor] - olens_in = olens.new( - [olen // self.reduction_factor for olen in olens]) + olens_in = olens // 
self.reduction_factor else: ys_in, olens_in = ys, olens @@ -769,318 +769,3 @@ class TransformerTTSInference(nn.Layer): normalized_mel = self.acoustic_model.inference(text)[0] logmel = self.normalizer.inverse(normalized_mel) return logmel - - -class TransformerTTSLoss(nn.Layer): - """Loss function module for Tacotron2.""" - - def __init__(self, - use_masking=True, - use_weighted_masking=False, - bce_pos_weight=5.0): - """Initialize Tactoron2 loss module. - - Parameters - ---------- - use_masking : bool - Whether to apply masking for padded part in loss calculation. - use_weighted_masking : bool - Whether to apply weighted masking in loss calculation. - bce_pos_weight : float - Weight of positive sample of stop token. - - """ - super().__init__() - assert (use_masking != use_weighted_masking) or not use_masking - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - - # define criterions - reduction = "none" if self.use_weighted_masking else "mean" - self.l1_criterion = nn.L1Loss(reduction=reduction) - self.mse_criterion = nn.MSELoss(reduction=reduction) - self.bce_criterion = nn.BCEWithLogitsLoss( - reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) - - def forward(self, after_outs, before_outs, logits, ys, labels, olens): - """Calculate forward propagation. - - Parameters - ---------- - after_outs : Tensor - Batch of outputs after postnets (B, Lmax, odim). - before_outs : Tensor - Batch of outputs before postnets (B, Lmax, odim). - logits : Tensor - Batch of stop logits (B, Lmax). - ys : Tensor - Batch of padded target features (B, Lmax, odim). - labels : LongTensor - Batch of the sequences of stop token labels (B, Lmax). - olens : LongTensor - Batch of the lengths of each target (B,). - - Returns - ---------- - Tensor - L1 loss value. - Tensor - Mean square error loss value. - Tensor - Binary cross entropy loss value. 
- - """ - # make mask and apply it - if self.use_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - ys = ys.masked_select(masks.broadcast_to(ys.shape)) - after_outs = after_outs.masked_select( - masks.broadcast_to(after_outs.shape)) - before_outs = before_outs.masked_select( - masks.broadcast_to(before_outs.shape)) - # Operator slice does not have kernel for data_type[bool] - tmp_masks = paddle.cast(masks, dtype='int64') - tmp_masks = tmp_masks[:, :, 0] - tmp_masks = paddle.cast(tmp_masks, dtype='bool') - labels = labels.masked_select(tmp_masks.broadcast_to(labels.shape)) - logits = logits.masked_select(tmp_masks.broadcast_to(logits.shape)) - - # calculate loss - l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( - before_outs, ys) - mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( - before_outs, ys) - bce_loss = self.bce_criterion(logits, labels) - - # make weighted mask and apply it - if self.use_weighted_masking: - masks = make_non_pad_mask(olens).unsqueeze(-1) - weights = masks.float() / masks.sum(dim=1, keepdim=True).float() - out_weights = weights.div(ys.shape[0] * ys.shape[2]) - logit_weights = weights.div(ys.shape[0]) - - # apply weight - l1_loss = l1_loss.multiply(out_weights) - l1_loss = l1_loss.masked_select( - masks.broadcast_to(l1_loss.shape)).sum() - - mse_loss = mse_loss.multiply(out_weights) - mse_loss = mse_loss.masked_select( - masks.broadcast_to(mse_loss.shape)).sum() - - bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) - bce_loss = bce_loss.masked_select( - masks.squeeze(-1).broadcast_to(bce_loss.shape)).sum() - - return l1_loss, mse_loss, bce_loss - - -class GuidedAttentionLoss(nn.Layer): - """Guided attention loss function module. - - This module calculates the guided attention loss described - in `Efficiently Trainable Text-to-Speech System Based - on Deep Convolutional Networks with Guided Attention`_, - which forces the attention to be diagonal. - - .. _`Efficiently Trainable Text-to-Speech System - Based on Deep Convolutional Networks with Guided Attention`: - https://arxiv.org/abs/1710.08969 - - """ - - def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): - """Initialize guided attention loss module. - - Parameters - ---------- - sigma : float, optional - Standard deviation to control how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - super(GuidedAttentionLoss, self).__init__() - self.sigma = sigma - self.alpha = alpha - self.reset_always = reset_always - self.guided_attn_masks = None - self.masks = None - - def _reset_masks(self): - self.guided_attn_masks = None - self.masks = None - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of attention weights (B, T_max_out, T_max_in). - ilens : LongTensor - Batch of input lenghts (B,). - olens : LongTensor - Batch of output lenghts (B,). - - Returns - ---------- - Tensor - Guided attention loss value. 
- - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = self._make_guided_attention_masks(ilens, - olens) - if self.masks is None: - self.masks = self._make_masks(ilens, olens) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - return self.alpha * loss - - def _make_guided_attention_masks(self, ilens, olens): - n_batches = len(ilens) - max_ilen = max(ilens) - max_olen = max(olens) - guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) - - for idx, (ilen, olen) in enumerate(zip(ilens, olens)): - - ilen = int(ilen) - olen = int(olen) - guided_attn_masks[idx, :olen, : - ilen] = self._make_guided_attention_mask( - ilen, olen, self.sigma) - return guided_attn_masks - - @staticmethod - def _make_guided_attention_mask(ilen, olen, sigma): - """Make guided attention mask. - - Examples - ---------- - >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) - >>> guided_attn_mask.shape - [5, 5] - >>> guided_attn_mask - tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], - [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], - [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], - [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], - [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) - >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) - >>> guided_attn_mask.shape - [6, 3] - >>> guided_attn_mask - tensor([[0.0000, 0.2934, 0.7506], - [0.0831, 0.0831, 0.5422], - [0.2934, 0.0000, 0.2934], - [0.5422, 0.0831, 0.0831], - [0.7506, 0.2934, 0.0000], - [0.8858, 0.5422, 0.0831]]) - - """ - grid_x, grid_y = paddle.meshgrid( - paddle.arange(olen), paddle.arange(ilen)) - grid_x = grid_x.cast(dtype=paddle.float32) - grid_y = grid_y.cast(dtype=paddle.float32) - return 1.0 - paddle.exp(-( - (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) - - @staticmethod - def _make_masks(ilens, olens): - """Make masks indicating non-padded part. - - Parameters - ---------- - ilens (LongTensor or List): Batch of lengths (B,). - olens (LongTensor or List): Batch of lengths (B,). - - Returns - ---------- - Tensor - Mask tensor indicating non-padded part. - - Examples - ---------- - >>> ilens, olens = [5, 2], [8, 5] - >>> _make_mask(ilens, olens) - tensor([[[1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1], - [1, 1, 1, 1, 1]], - - [[1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [1, 1, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]]], dtype=paddle.uint8) - - """ - # (B, T_in) - in_masks = make_non_pad_mask(ilens) - # (B, T_out) - out_masks = make_non_pad_mask(olens) - # (B, T_out, T_in) - - return paddle.logical_and( - out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) - - -class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): - """Guided attention loss function module for multi head attention. - - Parameters - ---------- - sigma : float, optional - Standard deviation to controlGuidedAttentionLoss - how close attention to a diagonal. - alpha : float, optional - Scaling coefficient (lambda). - reset_always : bool, optional - Whether to always reset masks. - - """ - - def forward(self, att_ws, ilens, olens): - """Calculate forward propagation. - - Parameters - ---------- - att_ws : Tensor - Batch of multi head attention weights (B, H, T_max_out, T_max_in). - ilens : Tensor - Batch of input lenghts (B,). - olens : Tensor - Batch of output lenghts (B,). 
- - Returns - ---------- - Tensor - Guided attention loss value. - - """ - if self.guided_attn_masks is None: - self.guided_attn_masks = ( - self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) - if self.masks is None: - self.masks = self._make_masks(ilens, olens).unsqueeze(1) - losses = self.guided_attn_masks * att_ws - loss = paddle.mean( - losses.masked_select(self.masks.broadcast_to(losses.shape))) - if self.reset_always: - self._reset_masks() - - return self.alpha * loss diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py index f16cf4dd9e76e7992197aa486352ac08c25198e1..dff908e05bf01d181352fc6ebd28113f0a106923 100644 --- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py +++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py @@ -12,13 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +from pathlib import Path from typing import Sequence import paddle from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer -from paddlespeech.t2s.models.transformer_tts import GuidedMultiHeadAttentionLoss -from paddlespeech.t2s.models.transformer_tts import TransformerTTSLoss +from paddlespeech.t2s.modules.losses import GuidedMultiHeadAttentionLoss +from paddlespeech.t2s.modules.losses import Tacotron2Loss as TransformerTTSLoss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater @@ -32,38 +36,34 @@ logger.setLevel(logging.INFO) class TransformerTTSUpdater(StandardUpdater): def __init__( self, - model, - optimizer, - dataloader, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, optimizer, dataloader, init_state=None) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -75,7 +75,7 @@ class TransformerTTSUpdater(StandardUpdater): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict 
= {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -86,7 +86,7 @@ class TransformerTTSUpdater(StandardUpdater): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("train/bce_loss", float(bce_loss)) @@ -120,7 +120,10 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_in, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -137,7 +140,8 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("train/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -154,7 +158,10 @@ class TransformerTTSUpdater(StandardUpdater): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("train/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss @@ -182,37 +189,33 @@ class TransformerTTSUpdater(StandardUpdater): class TransformerTTSEvaluator(StandardEvaluator): def __init__( self, - model, - dataloader, + model: Layer, + dataloader: DataLoader, init_state=None, - use_masking=False, - use_weighted_masking=False, - output_dir=None, - bce_pos_weight=5.0, + use_masking: bool=False, + use_weighted_masking: bool=False, + output_dir: Path=None, + bce_pos_weight: float=5.0, loss_type: str="L1", use_guided_attn_loss: bool=True, modules_applied_guided_attn: Sequence[str]=("encoder-decoder"), guided_attn_loss_sigma: float=0.4, guided_attn_loss_lambda: float=1.0, ): super().__init__(model, dataloader) - self.use_masking = use_masking - self.use_weighted_masking = use_weighted_masking - self.bce_pos_weight = bce_pos_weight + self.loss_type = loss_type self.use_guided_attn_loss = use_guided_attn_loss - self.guided_attn_loss_sigma = guided_attn_loss_sigma - self.guided_attn_loss_lambda = guided_attn_loss_lambda self.modules_applied_guided_attn = modules_applied_guided_attn self.criterion = TransformerTTSLoss( - use_masking=self.use_masking, - use_weighted_masking=self.use_weighted_masking, - bce_pos_weight=self.bce_pos_weight) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, + bce_pos_weight=bce_pos_weight) if self.use_guided_attn_loss: self.attn_criterion = GuidedMultiHeadAttentionLoss( - sigma=self.guided_attn_loss_sigma, - alpha=self.guided_attn_loss_lambda, ) + sigma=guided_attn_loss_sigma, + alpha=guided_attn_loss_lambda, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) @@ -223,7 +226,7 @@ class TransformerTTSEvaluator(StandardEvaluator): def evaluate_core(self, batch): 
self.msg = "Evaluate: " losses_dict = {} - after_outs, before_outs, logits, ys, labels, olens, ilens, need_dict = self.model( + after_outs, before_outs, logits, ys, stop_labels, olens, olens_in, need_dict = self.model( text=batch["text"], text_lengths=batch["text_lengths"], speech=batch["speech"], @@ -234,7 +237,7 @@ class TransformerTTSEvaluator(StandardEvaluator): before_outs=before_outs, logits=logits, ys=ys, - labels=labels, + stop_labels=stop_labels, olens=olens) report("eval/bce_loss", float(bce_loss)) @@ -268,7 +271,10 @@ class TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_in, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_attn_loss = self.attn_criterion(att_ws, ilens, ilens) + enc_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=batch["text_lengths"] + 1) loss = loss + enc_attn_loss report("train/enc_attn_loss", float(enc_attn_loss)) losses_dict["enc_attn_loss"] = float(enc_attn_loss) @@ -285,7 +291,8 @@ class TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_out, T_out) att_ws = paddle.concat(att_ws, axis=1) - dec_attn_loss = self.attn_criterion(att_ws, olens, olens) + dec_attn_loss = self.attn_criterion( + att_ws=att_ws, ilens=olens_in, olens=olens_in) report("eval/dec_attn_loss", float(dec_attn_loss)) losses_dict["dec_attn_loss"] = float(dec_attn_loss) loss = loss + dec_attn_loss @@ -303,7 +310,10 @@ class TransformerTTSEvaluator(StandardEvaluator): break # (B, H*L, T_out, T_in) att_ws = paddle.concat(att_ws, axis=1) - enc_dec_attn_loss = self.attn_criterion(att_ws, ilens, olens) + enc_dec_attn_loss = self.attn_criterion( + att_ws=att_ws, + ilens=batch["text_lengths"] + 1, + olens=olens_in) report("eval/enc_dec_attn_loss", float(enc_dec_attn_loss)) losses_dict["enc_dec_attn_loss"] = float(enc_dec_attn_loss) loss = loss + enc_dec_attn_loss diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 569e96ada6f493ecba6d923ed8fa02cac7500fb8..3cc7a93cb5f74d50a2562f61b18b9f1e1bab6bf8 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -20,6 +20,314 @@ from paddle.fluid.layers import sequence_mask from paddle.nn import functional as F from scipy import signal +from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask + + +# Loss for new Tacotron2 +class GuidedAttentionLoss(nn.Layer): + """Guided attention loss function module. + + This module calculates the guided attention loss described + in `Efficiently Trainable Text-to-Speech System Based + on Deep Convolutional Networks with Guided Attention`_, + which forces the attention to be diagonal. + + .. _`Efficiently Trainable Text-to-Speech System + Based on Deep Convolutional Networks with Guided Attention`: + https://arxiv.org/abs/1710.08969 + + """ + + def __init__(self, sigma=0.4, alpha=1.0, reset_always=True): + """Initialize guided attention loss module. + + Parameters + ---------- + sigma : float, optional + Standard deviation to control how close attention to a diagonal. + alpha : float, optional + Scaling coefficient (lambda). + reset_always : bool, optional + Whether to always reset masks. + + """ + super().__init__() + self.sigma = sigma + self.alpha = alpha + self.reset_always = reset_always + self.guided_attn_masks = None + self.masks = None + + def _reset_masks(self): + self.guided_attn_masks = None + self.masks = None + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. 
+ + Parameters + ---------- + att_ws : Tensor + Batch of attention weights (B, T_max_out, T_max_in). + ilens : Tensor(int64) + Batch of input lenghts (B,). + olens : Tensor(int64) + Batch of output lenghts (B,). + + Returns + ---------- + Tensor + Guided attention loss value. + + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = self._make_guided_attention_masks(ilens, + olens) + if self.masks is None: + self.masks = self._make_masks(ilens, olens) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + return self.alpha * loss + + def _make_guided_attention_masks(self, ilens, olens): + n_batches = len(ilens) + max_ilen = max(ilens) + max_olen = max(olens) + guided_attn_masks = paddle.zeros((n_batches, max_olen, max_ilen)) + + for idx, (ilen, olen) in enumerate(zip(ilens, olens)): + guided_attn_masks[idx, :olen, : + ilen] = self._make_guided_attention_mask( + ilen, olen, self.sigma) + return guided_attn_masks + + @staticmethod + def _make_guided_attention_mask(ilen, olen, sigma): + """Make guided attention mask. + + Examples + ---------- + >>> guided_attn_mask =_make_guided_attention(5, 5, 0.4) + >>> guided_attn_mask.shape + [5, 5] + >>> guided_attn_mask + tensor([[0.0000, 0.1175, 0.3935, 0.6753, 0.8647], + [0.1175, 0.0000, 0.1175, 0.3935, 0.6753], + [0.3935, 0.1175, 0.0000, 0.1175, 0.3935], + [0.6753, 0.3935, 0.1175, 0.0000, 0.1175], + [0.8647, 0.6753, 0.3935, 0.1175, 0.0000]]) + >>> guided_attn_mask =_make_guided_attention(3, 6, 0.4) + >>> guided_attn_mask.shape + [6, 3] + >>> guided_attn_mask + tensor([[0.0000, 0.2934, 0.7506], + [0.0831, 0.0831, 0.5422], + [0.2934, 0.0000, 0.2934], + [0.5422, 0.0831, 0.0831], + [0.7506, 0.2934, 0.0000], + [0.8858, 0.5422, 0.0831]]) + + """ + grid_x, grid_y = paddle.meshgrid( + paddle.arange(olen), paddle.arange(ilen)) + grid_x = grid_x.cast(dtype=paddle.float32) + grid_y = grid_y.cast(dtype=paddle.float32) + return 1.0 - paddle.exp(-( + (grid_y / ilen - grid_x / olen)**2) / (2 * (sigma**2))) + + @staticmethod + def _make_masks(ilens, olens): + """Make masks indicating non-padded part. + + Parameters + ---------- + ilens : Tensor(int64) or List + Batch of lengths (B,). + olens : Tensor(int64) or List + Batch of lengths (B,). + + Returns + ---------- + Tensor + Mask tensor indicating non-padded part. + + Examples + ---------- + >>> ilens, olens = [5, 2], [8, 5] + >>> _make_mask(ilens, olens) + tensor([[[1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1]], + + [[1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]], dtype=paddle.uint8) + + """ + # (B, T_in) + in_masks = make_non_pad_mask(ilens) + # (B, T_out) + out_masks = make_non_pad_mask(olens) + # (B, T_out, T_in) + + return paddle.logical_and( + out_masks.unsqueeze(-1), in_masks.unsqueeze(-2)) + + +class GuidedMultiHeadAttentionLoss(GuidedAttentionLoss): + """Guided attention loss function module for multi head attention. + + Parameters + ---------- + sigma : float, optional + Standard deviation to controlGuidedAttentionLoss + how close attention to a diagonal. + alpha : float, optional + Scaling coefficient (lambda). + reset_always : bool, optional + Whether to always reset masks. + + """ + + def forward(self, att_ws, ilens, olens): + """Calculate forward propagation. 
+ + Parameters + ---------- + att_ws : Tensor + Batch of multi head attention weights (B, H, T_max_out, T_max_in). + ilens : Tensor + Batch of input lenghts (B,). + olens : Tensor + Batch of output lenghts (B,). + + Returns + ---------- + Tensor + Guided attention loss value. + + """ + if self.guided_attn_masks is None: + self.guided_attn_masks = ( + self._make_guided_attention_masks(ilens, olens).unsqueeze(1)) + if self.masks is None: + self.masks = self._make_masks(ilens, olens).unsqueeze(1) + losses = self.guided_attn_masks * att_ws + loss = paddle.mean( + losses.masked_select(self.masks.broadcast_to(losses.shape))) + if self.reset_always: + self._reset_masks() + + return self.alpha * loss + + +class Tacotron2Loss(nn.Layer): + """Loss function module for Tacotron2.""" + + def __init__(self, + use_masking=True, + use_weighted_masking=False, + bce_pos_weight=20.0): + """Initialize Tactoron2 loss module. + Parameters + ---------- + use_masking : bool + Whether to apply masking for padded part in loss calculation. + use_weighted_masking : bool + Whether to apply weighted masking in loss calculation. + bce_pos_weight : float + Weight of positive sample of stop token. + """ + super().__init__() + assert (use_masking != use_weighted_masking) or not use_masking + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + + # define criterions + reduction = "none" if self.use_weighted_masking else "mean" + self.l1_criterion = nn.L1Loss(reduction=reduction) + self.mse_criterion = nn.MSELoss(reduction=reduction) + self.bce_criterion = nn.BCEWithLogitsLoss( + reduction=reduction, pos_weight=paddle.to_tensor(bce_pos_weight)) + + def forward(self, after_outs, before_outs, logits, ys, stop_labels, olens): + """Calculate forward propagation. + Parameters + ---------- + after_outs : Tensor + Batch of outputs after postnets (B, Lmax, odim). + before_outs : Tensor + Batch of outputs before postnets (B, Lmax, odim). + logits : Tensor + Batch of stop logits (B, Lmax). + ys : Tensor + Batch of padded target features (B, Lmax, odim). + stop_labels : Tensor(int64) + Batch of the sequences of stop token labels (B, Lmax). + olens : Tensor(int64) + Batch of the lengths of each target (B,). + Returns + ---------- + Tensor + L1 loss value. + Tensor + Mean square error loss value. + Tensor + Binary cross entropy loss value. 
+ """ + # make mask and apply it + if self.use_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + ys = ys.masked_select(masks.broadcast_to(ys.shape)) + after_outs = after_outs.masked_select( + masks.broadcast_to(after_outs.shape)) + before_outs = before_outs.masked_select( + masks.broadcast_to(before_outs.shape)) + stop_labels = stop_labels.masked_select( + masks[:, :, 0].broadcast_to(stop_labels.shape)) + logits = logits.masked_select( + masks[:, :, 0].broadcast_to(logits.shape)) + + # calculate loss + l1_loss = self.l1_criterion(after_outs, ys) + self.l1_criterion( + before_outs, ys) + mse_loss = self.mse_criterion(after_outs, ys) + self.mse_criterion( + before_outs, ys) + bce_loss = self.bce_criterion(logits, stop_labels) + + # make weighted mask and apply it + if self.use_weighted_masking: + masks = make_non_pad_mask(olens).unsqueeze(-1) + weights = masks.float() / masks.sum(axis=1, keepdim=True).float() + out_weights = weights.divide( + paddle.shape(ys)[0] * paddle.shape(ys)[2]) + logit_weights = weights.divide(paddle.shape(ys)[0]) + + # apply weight + l1_loss = l1_loss.multiply(out_weights) + l1_loss = l1_loss.masked_select(masks.broadcast_to(l1_loss)).sum() + mse_loss = mse_loss.multiply(out_weights) + mse_loss = mse_loss.masked_select( + masks.broadcast_to(mse_loss)).sum() + bce_loss = bce_loss.multiply(logit_weights.squeeze(-1)) + bce_loss = bce_loss.masked_select( + masks.squeeze(-1).broadcast_to(bce_loss)).sum() + + return l1_loss, mse_loss, bce_loss + # Loss for Tacotron2 def attention_guide(dec_lens, enc_lens, N, T, g, dtype=None): @@ -80,7 +388,7 @@ def stft(x, details. Defaults to "hann". center : bool, optional center (bool, optional): Whether to pad `x` to make that the - :math:`t \times hop\_length` at the center of :math:`t`-th frame. Default: `True`. + :math:`t \times hop\\_length` at the center of :math:`t`-th frame. Default: `True`. pad_mode : str, optional Choose padding pattern when `center` is `True`. Returns @@ -433,7 +741,8 @@ def weighted_mean(input, weight): Weighted mean tensor with the same dtype as input. """ weight = paddle.cast(weight, input.dtype) - broadcast_ratio = input.size / weight.size + # paddle.Tensor.size is different with torch.size() and has been overrided in s2t.__init__ + broadcast_ratio = input.numel() / weight.numel() return paddle.sum(input * weight) / (paddle.sum(weight) * broadcast_ratio) diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..710e326d6083da4aa550740d915afef0382a139e --- /dev/null +++ b/paddlespeech/t2s/modules/tacotron2/attentions.py @@ -0,0 +1,519 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Attention modules for RNN.""" +import paddle +import paddle.nn.functional as F +from paddle import nn + +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import make_pad_mask + + +def _apply_attention_constraint(e, + last_attended_idx, + backward_window=1, + forward_window=3): + """Apply monotonic attention constraint. + + This function apply the monotonic attention constraint + introduced in `Deep Voice 3: Scaling + Text-to-Speech with Convolutional Sequence Learning`_. + + Parameters + ---------- + e : Tensor + Attention energy before applying softmax (1, T). + last_attended_idx : int + The index of the inputs of the last attended [0, T]. + backward_window : int, optional + Backward window size in attention constraint. + forward_window : int, optional + Forward window size in attetion constraint. + + Returns + ---------- + Tensor + Monotonic constrained attention energy (1, T). + + .. _`Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning`: + https://arxiv.org/abs/1710.07654 + + """ + if paddle.shape(e)[0] != 1: + raise NotImplementedError( + "Batch attention constraining is not yet supported.") + backward_idx = last_attended_idx - backward_window + forward_idx = last_attended_idx + forward_window + if backward_idx > 0: + e[:, :backward_idx] = -float("inf") + if forward_idx < paddle.shape(e)[1]: + e[:, forward_idx:] = -float("inf") + return e + + +class AttLoc(nn.Layer): + """location-aware attention module. + + Reference: Attention-Based Models for Speech Recognition + (https://arxiv.org/pdf/1506.07503.pdf) + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + att_dim: attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + han_mode : bool + flag to swith on mode of hierarchical attention and not store pre_compute_enc_h + """ + + def __init__(self, + eprojs, + dunits, + att_dim, + aconv_chans, + aconv_filts, + han_mode=False): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.han_mode = han_mode + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=2.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttLoc forward propagation. 
+ Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : paddle.Tensor + padded encoder hidden state length (B) + dec_z : paddle.Tensor dec_z + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + previous attention weight (B, T_max) + scaling : float + scaling parameter before applying softmax + forward_window : paddle.Tensor + forward window size when constraining attention + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None or self.han_mode: + # (utt, frame, hdim) + self.enc_h = enc_hs_pad + self.h_length = paddle.shape(self.enc_h)[1] + # (utt, frame, att_dim) + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + # initialize attention weight with uniform dist. + if att_prev is None: + # if no bias, 0 0-pad goes 0 + + att_prev = 1.0 - make_pad_mask(enc_hs_len) + att_prev = att_prev / enc_hs_len.unsqueeze(-1) + + # att_prev: (utt, frame) -> (utt, 1, 1, frame) + # -> (utt, att_conv_chans, 1, frame) + + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: (utt, att_conv_chans, 1, frame) -> (utt, frame, att_conv_chans) + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: (utt, frame, att_conv_chans) -> (utt, frame, att_dim) + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: (utt, frame, att_dim) + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # (utt, frame, att_dim) -> (utt, frame) + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE: consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # weighted sum over frames + # utt x hdim + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + return c, w + + +class AttForward(nn.Layer): + """Forward attention module. 
+ Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + + Parameters + ---------- + eprojs : int + projection-units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + """ + + def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts): + super().__init__() + self.mlp_enc = nn.Linear(eprojs, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eprojs = eprojs + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def reset(self): + """reset states""" + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForward forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, T_max, D_enc) + enc_hs_len : list + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, D_dec) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, D_enc) + paddle.Tensor + previous attention weights (B, T_max) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] + att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).unsqueeze(1) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(self.pre_compute_enc_h + dec_z_tiled + + att_conv)).squeeze(2) + + # NOTE: consider zero padding when compute w. 
+ if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + + w = (att_prev + att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum(self.enc_h * w.unsqueeze(-1), axis=1) + + return c, w + + +class AttForwardTA(nn.Layer): + """Forward attention with transition agent module. + Reference + ---------- + Forward attention in sequence-to-sequence acoustic modeling for speech synthesis + (https://arxiv.org/pdf/1807.06736.pdf) + Parameters + ---------- + eunits : int + units of encoder + dunits : int + units of decoder + att_dim : int + attention dimension + aconv_chans : int + channels of attention convolution + aconv_filts : int + filter size of attention convolution + odim : int + output dimension + """ + + def __init__(self, eunits, dunits, att_dim, aconv_chans, aconv_filts, odim): + super().__init__() + self.mlp_enc = nn.Linear(eunits, att_dim) + self.mlp_dec = nn.Linear(dunits, att_dim, bias_attr=False) + self.mlp_ta = nn.Linear(eunits + dunits + odim, 1) + self.mlp_att = nn.Linear(aconv_chans, att_dim, bias_attr=False) + self.loc_conv = nn.Conv2D( + 1, + aconv_chans, + (1, 2 * aconv_filts + 1), + padding=(0, aconv_filts), + bias_attr=False, ) + self.gvec = nn.Linear(att_dim, 1) + self.dunits = dunits + self.eunits = eunits + self.att_dim = att_dim + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def reset(self): + self.h_length = None + self.enc_h = None + self.pre_compute_enc_h = None + self.mask = None + self.trans_agent_prob = 0.5 + + def forward( + self, + enc_hs_pad, + enc_hs_len, + dec_z, + att_prev, + out_prev, + scaling=1.0, + last_attended_idx=None, + backward_window=1, + forward_window=3, ): + """Calculate AttForwardTA forward propagation. + Parameters + ---------- + enc_hs_pad : paddle.Tensor + padded encoder hidden state (B, Tmax, eunits) + enc_hs_len : list paddle.Tensor + padded encoder hidden state length (B,) + dec_z : paddle.Tensor + decoder hidden state (B, dunits) + att_prev : paddle.Tensor + attention weights of previous step (B, T_max) + out_prev : paddle.Tensor + decoder outputs of previous step (B, odim) + scaling : float + scaling parameter before applying softmax + last_attended_idx : int + index of the inputs of the last attended + backward_window : int + backward window size in attention constraint + forward_window : int + forward window size in attetion constraint + Returns + ---------- + paddle.Tensor + attention weighted encoder state (B, dunits) + paddle.Tensor + previous attention weights (B, Tmax) + """ + batch = len(enc_hs_pad) + # pre-compute all h outside the decoder loop + if self.pre_compute_enc_h is None: + self.enc_h = enc_hs_pad # utt x frame x hdim + self.h_length = paddle.shape(self.enc_h)[1] + # utt x frame x att_dim + self.pre_compute_enc_h = self.mlp_enc(self.enc_h) + + if dec_z is None: + dec_z = paddle.zeros([batch, self.dunits]) + else: + dec_z = dec_z.reshape([batch, self.dunits]) + + if att_prev is None: + # initial attention will be [1, 0, 0, ...] 
+ att_prev = paddle.zeros([*paddle.shape(enc_hs_pad)[:2]]) + att_prev[:, 0] = 1.0 + + # att_prev: utt x frame -> utt x 1 x 1 x frame + # -> utt x att_conv_chans x 1 x frame + att_conv = self.loc_conv(att_prev.reshape([batch, 1, 1, self.h_length])) + # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans + att_conv = att_conv.squeeze(2).transpose([0, 2, 1]) + # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim + att_conv = self.mlp_att(att_conv) + + # dec_z_tiled: utt x frame x att_dim + dec_z_tiled = self.mlp_dec(dec_z).reshape([batch, 1, self.att_dim]) + + # dot with gvec + # utt x frame x att_dim -> utt x frame + e = self.gvec( + paddle.tanh(att_conv + self.pre_compute_enc_h + + dec_z_tiled)).squeeze(2) + + # NOTE consider zero padding when compute w. + if self.mask is None: + self.mask = make_pad_mask(enc_hs_len) + e = masked_fill(e, self.mask, -float("inf")) + + # apply monotonic attention constraint (mainly for TTS) + if last_attended_idx is not None: + e = _apply_attention_constraint(e, last_attended_idx, + backward_window, forward_window) + + w = F.softmax(scaling * e, axis=1) + + # forward attention + # att_prev_shift = F.pad(att_prev.unsqueeze(0), (1, 0), data_format='NCL').squeeze(0)[:, :-1] + att_prev_shift = F.pad(att_prev, (0, 0, 1, 0))[:, :-1] + w = (self.trans_agent_prob * att_prev + + (1 - self.trans_agent_prob) * att_prev_shift) * w + # NOTE: clip is needed to avoid nan gradient + w = F.normalize(paddle.clip(w, 1e-6), p=1, axis=1) + + # weighted sum over flames + # utt x hdim + # NOTE use bmm instead of sum(*) + c = paddle.sum( + self.enc_h * w.reshape([batch, self.h_length, 1]), axis=1) + + # update transition agent prob + self.trans_agent_prob = F.sigmoid( + self.mlp_ta(paddle.concat([c, out_prev, dec_z], axis=1))) + + return c, w diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py index 691bb3ee29c31397400e9ea6b77d982a1b2f0f64..fc15adfda30a5ded3481fe570a59a41b60da2bcc 100644 --- a/paddlespeech/t2s/modules/tacotron2/decoder.py +++ b/paddlespeech/t2s/modules/tacotron2/decoder.py @@ -13,10 +13,13 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """Tacotron2 decoder related modules.""" +import paddle import paddle.nn.functional as F import six from paddle import nn +from paddlespeech.t2s.modules.tacotron2.attentions import AttForwardTA + class Prenet(nn.Layer): """Prenet module for decoder of Spectrogram prediction network. @@ -196,3 +199,527 @@ class Postnet(nn.Layer): for i in six.moves.range(len(self.postnet)): xs = self.postnet[i](xs) return xs + + +class ZoneOutCell(nn.Layer): + """ZoneOut Cell module. + This is a module of zoneout described in + `Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`_. + This code is modified from `eladhoffer/seq2seq.pytorch`_. + Examples + ---------- + >>> lstm = paddle.nn.LSTMCell(16, 32) + >>> lstm = ZoneOutCell(lstm, 0.5) + .. _`Zoneout: Regularizing RNNs by Randomly Preserving Hidden Activations`: + https://arxiv.org/abs/1606.01305 + .. _`eladhoffer/seq2seq.pytorch`: + https://github.com/eladhoffer/seq2seq.pytorch + """ + + def __init__(self, cell, zoneout_rate=0.1): + """Initialize zone out cell module. + Parameters + ---------- + cell : nn.Layer: + Paddle recurrent cell module + e.g. `paddle.nn.LSTMCell`. + zoneout_rate : float, optional + Probability of zoneout from 0.0 to 1.0. 
+ """ + super().__init__() + self.cell = cell + self.hidden_size = cell.hidden_size + self.zoneout_rate = zoneout_rate + if zoneout_rate > 1.0 or zoneout_rate < 0.0: + raise ValueError( + "zoneout probability must be in the range from 0.0 to 1.0.") + + def forward(self, inputs, hidden): + """Calculate forward propagation. + Parameters + ---------- + inputs : Tensor + Batch of input tensor (B, input_size). + hidden : tuple + - Tensor: Batch of initial hidden states (B, hidden_size). + - Tensor: Batch of initial cell states (B, hidden_size). + Returns + ---------- + Tensor + Batch of next hidden states (B, hidden_size). + tuple: + - Tensor: Batch of next hidden states (B, hidden_size). + - Tensor: Batch of next cell states (B, hidden_size). + """ + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.cell(inputs, hidden) + next_hidden = self._zoneout(hidden, next_hidden, self.zoneout_rate) + # to have the same output format with LSTMCell in paddle + return next_hidden[0], next_hidden + + def _zoneout(self, h, next_h, prob): + # apply recursively + if isinstance(h, tuple): + num_h = len(h) + if not isinstance(prob, tuple): + prob = tuple([prob] * num_h) + return tuple( + [self._zoneout(h[i], next_h[i], prob[i]) for i in range(num_h)]) + if self.training: + mask = paddle.bernoulli(paddle.ones([*paddle.shape(h)]) * prob) + return mask * h + (1 - mask) * next_h + else: + return prob * h + (1 - prob) * next_h + + +class Decoder(nn.Layer): + """Decoder module of Spectrogram prediction network. + This is a module of decoder of Spectrogram prediction network in Tacotron2, + which described in `Natural TTS + Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`_. + The decoder generates the sequence of + features from the sequence of the hidden states. + .. _`Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions`: + https://arxiv.org/abs/1712.05884 + """ + + def __init__( + self, + idim, + odim, + att, + dlayers=2, + dunits=1024, + prenet_layers=2, + prenet_units=256, + postnet_layers=5, + postnet_chans=512, + postnet_filts=5, + output_activation_fn=None, + cumulate_att_w=True, + use_batch_norm=True, + use_concate=True, + dropout_rate=0.5, + zoneout_rate=0.1, + reduction_factor=1, ): + """Initialize Tacotron2 decoder module. + Parameters + ---------- + idim : int + Dimension of the inputs. + odim : int + Dimension of the outputs. + att nn.Layer + Instance of attention class. + dlayers int, optional + The number of decoder lstm layers. + dunits : int, optional + The number of decoder lstm units. + prenet_layers : int, optional + The number of prenet layers. + prenet_units : int, optional + The number of prenet units. + postnet_layers : int, optional + The number of postnet layers. + postnet_filts : int, optional + The number of postnet filter size. + postnet_chans : int, optional + The number of postnet filter channels. + output_activation_fn : nn.Layer, optional + Activation function for outputs. + cumulate_att_w : bool, optional + Whether to cumulate previous attention weight. + use_batch_norm : bool, optional + Whether to use batch normalization. + use_concate : bool, optional + Whether to concatenate encoder embedding with decoder lstm outputs. + dropout_rate : float, optional + Dropout rate. + zoneout_rate : float, optional + Zoneout rate. + reduction_factor : int, optional + Reduction factor. 
+ """ + super().__init__() + + # store the hyperparameters + self.idim = idim + self.odim = odim + self.att = att + self.output_activation_fn = output_activation_fn + self.cumulate_att_w = cumulate_att_w + self.use_concate = use_concate + self.reduction_factor = reduction_factor + + # check attention type + if isinstance(self.att, AttForwardTA): + self.use_att_extra_inputs = True + else: + self.use_att_extra_inputs = False + + # define lstm network + prenet_units = prenet_units if prenet_layers != 0 else odim + self.lstm = nn.LayerList() + for layer in six.moves.range(dlayers): + iunits = idim + prenet_units if layer == 0 else dunits + lstm = nn.LSTMCell(iunits, dunits) + if zoneout_rate > 0.0: + lstm = ZoneOutCell(lstm, zoneout_rate) + self.lstm.append(lstm) + + # define prenet + if prenet_layers > 0: + self.prenet = Prenet( + idim=odim, + n_layers=prenet_layers, + n_units=prenet_units, + dropout_rate=dropout_rate, ) + else: + self.prenet = None + + # define postnet + if postnet_layers > 0: + self.postnet = Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=dropout_rate, ) + else: + self.postnet = None + + # define projection layers + iunits = idim + dunits if use_concate else dunits + self.feat_out = nn.Linear( + iunits, odim * reduction_factor, bias_attr=False) + self.prob_out = nn.Linear(iunits, reduction_factor) + + # initialize + # self.apply(decoder_init) + + def _zero_state(self, hs): + init_hs = paddle.zeros([paddle.shape(hs)[0], self.lstm[0].hidden_size]) + return init_hs + + def forward(self, hs, hlens, ys): + """Calculate forward propagation. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). + hlens : Tensor(int64) padded + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + Tensor + Batch of output tensors after postnet (B, Lmax, odim). + Tensor + Batch of output tensors before postnet (B, Lmax, odim). + Tensor + Batch of logits of stop prediction (B, Lmax). + Tensor + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. 
+ """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + # hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + outs, logits, att_ws = [], [], [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + outs += [ + self.feat_out(zcs).reshape([paddle.shape(hs)[0], self.odim, -1]) + ] + logits += [self.prob_out(zcs)] + att_ws += [att_w] + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + # (B, Lmax) + logits = paddle.concat(logits, axis=1) + # (B, odim, Lmax) + before_outs = paddle.concat(outs, axis=2) + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + if self.reduction_factor > 1: + # (B, odim, Lmax) + before_outs = before_outs.reshape( + [paddle.shape(before_outs)[0], self.odim, -1]) + + if self.postnet is not None: + # (B, odim, Lmax) + after_outs = before_outs + self.postnet(before_outs) + else: + after_outs = before_outs + # (B, Lmax, odim) + before_outs = before_outs.transpose([0, 2, 1]) + # (B, Lmax, odim) + after_outs = after_outs.transpose([0, 2, 1]) + logits = logits + + # apply activation function for scaling + if self.output_activation_fn is not None: + before_outs = self.output_activation_fn(before_outs) + after_outs = self.output_activation_fn(after_outs) + + return after_outs, before_outs, logits, att_ws + + def inference( + self, + h, + threshold=0.5, + minlenratio=0.0, + maxlenratio=10.0, + use_att_constraint=False, + backward_window=None, + forward_window=None, ): + """Generate the sequence of features given the sequences of characters. + Parameters + ---------- + h : Tensor + Input sequence of encoder hidden states (T, C). + threshold : float, optional + Threshold to stop generation. + minlenratio : float, optional + Minimum length ratio. + If set to 1.0 and the length of input is 10, + the minimum length of outputs will be 10 * 1 = 10. + minlenratio : float, optional + Minimum length ratio. + If set to 10 and the length of input is 10, + the maximum length of outputs will be 10 * 10 = 100. + use_att_constraint : bool + Whether to apply attention constraint introduced in `Deep Voice 3`_. + backward_window : int + Backward window size in attention constraint. 
+ forward_window : int + Forward window size in attention constraint. + Returns + ---------- + Tensor + Output sequence of features (L, odim). + Tensor + Output sequence of stop probabilities (L,). + Tensor + Attention weights (L, T). + Note + ---------- + This computation is performed in auto-regressive manner. + .. _`Deep Voice 3`: https://arxiv.org/abs/1710.07654 + """ + # setup + assert len(paddle.shape(h)) == 2 + hs = h.unsqueeze(0) + ilens = paddle.shape(h)[0] + maxlen = int(paddle.shape(h)[0] * maxlenratio) + minlen = int(paddle.shape(h)[0] * minlenratio) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([1, self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # setup for attention constraint + if use_att_constraint: + last_attended_idx = 0 + else: + last_attended_idx = None + + # loop for an output sequence + idx = 0 + outs, att_ws, probs = [], [], [] + while True: + # updated index + idx += self.reduction_factor + + # decoder calculation + if self.use_att_extra_inputs: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + prev_out, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + else: + att_c, att_w = self.att( + hs, + ilens, + z_list[0], + prev_att_w, + last_attended_idx=last_attended_idx, + backward_window=backward_window, + forward_window=forward_window, ) + + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + z_list[i], c_list[i] = next_hidden + zcs = (paddle.concat([z_list[-1], att_c], axis=1) + if self.use_concate else z_list[-1]) + # [(1, odim, r), ...] + outs += [self.feat_out(zcs).reshape([1, self.odim, -1])] + + # [(r), ...] + probs += [F.sigmoid(self.prob_out(zcs))[0]] + if self.output_activation_fn is not None: + prev_out = self.output_activation_fn( + outs[-1][:, :, -1]) # (1, odim) + else: + prev_out = outs[-1][:, :, -1] # (1, odim) + if self.cumulate_att_w and prev_att_w is not None: + prev_att_w = prev_att_w + att_w # Note: error when use += + else: + prev_att_w = att_w + if use_att_constraint: + last_attended_idx = int(att_w.argmax()) + + # check whether to finish generation + if sum(paddle.cast(probs[-1] >= threshold, + 'int64')) > 0 or idx >= maxlen: + # check mininum length + if idx < minlen: + continue + # (1, odim, L) + outs = paddle.concat(outs, axis=2) + if self.postnet is not None: + # (1, odim, L) + outs = outs + self.postnet(outs) + # (L, odim) + outs = outs.transpose([0, 2, 1]).squeeze(0) + probs = paddle.concat(probs, axis=0) + att_ws = paddle.concat(att_ws, axis=0) + break + + if self.output_activation_fn is not None: + outs = self.output_activation_fn(outs) + + return outs, probs, att_ws + + def calculate_all_attentions(self, hs, hlens, ys): + """Calculate all of the attention weights. + Parameters + ---------- + hs : Tensor + Batch of the sequences of padded hidden states (B, Tmax, idim). 
+ hlens : Tensor(int64) + Batch of lengths of each input batch (B,). + ys : Tensor + Batch of the sequences of padded target features (B, Lmax, odim). + Returns + ---------- + numpy.ndarray + Batch of attention weights (B, Lmax, Tmax). + Note + ---------- + This computation is performed in teacher-forcing manner. + """ + # thin out frames (B, Lmax, odim) -> (B, Lmax/r, odim) + if self.reduction_factor > 1: + ys = ys[:, self.reduction_factor - 1::self.reduction_factor] + + # length list should be list of int + hlens = list(map(int, hlens)) + + # initialize hidden states of decoder + c_list = [self._zero_state(hs)] + z_list = [self._zero_state(hs)] + for _ in six.moves.range(1, len(self.lstm)): + c_list += [self._zero_state(hs)] + z_list += [self._zero_state(hs)] + prev_out = paddle.zeros([paddle.shape(hs)[0], self.odim]) + + # initialize attention + prev_att_w = None + self.att.reset() + + # loop for an output sequence + att_ws = [] + for y in ys.transpose([1, 0, 2]): + if self.use_att_extra_inputs: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w, + prev_out) + else: + att_c, att_w = self.att(hs, hlens, z_list[0], prev_att_w) + att_ws += [att_w] + prenet_out = self.prenet( + prev_out) if self.prenet is not None else prev_out + xs = paddle.concat([att_c, prenet_out], axis=1) + # we only use the second output of LSTMCell in paddle + _, next_hidden = self.lstm[0](xs, (z_list[0], c_list[0])) + z_list[0], c_list[0] = next_hidden + for i in six.moves.range(1, len(self.lstm)): + z_list[i], c_list[i] = self.lstm[i](z_list[i - 1], + (z_list[i], c_list[i])) + # teacher forcing + prev_out = y + if self.cumulate_att_w and prev_att_w is not None: + # Note: error when use += + prev_att_w = prev_att_w + att_w + else: + prev_att_w = att_w + # (B, Lmax, Tmax) + att_ws = paddle.stack(att_ws, axis=1) + + return att_ws diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py index f1889061396b3ee6ce62026a0a0e039d61cfdba0..b2ed30d1f1cebc666e68c8555c4d69dfd1140331 100644 --- a/paddlespeech/t2s/modules/tacotron2/encoder.py +++ b/paddlespeech/t2s/modules/tacotron2/encoder.py @@ -145,16 +145,15 @@ class Encoder(nn.Layer): Batch of the padded sequence. Either character ids (B, Tmax) or acoustic feature (B, Tmax, idim * encoder_reduction_factor). Padded value should be 0. - ilens : LongTensor + ilens : Tensor(int64) Batch of lengths of each input batch (B,). Returns ---------- Tensor Batch of the sequences of encoder states(B, Tmax, eunits). 
-        LongTensor
+        Tensor(int64)
             Batch of lengths of each sequence (B,)
-
         """
         xs = self.embed(xs).transpose([0, 2, 1])
         if self.convs is not None:
@@ -170,8 +169,8 @@ class Encoder(nn.Layer):
         xs = xs.transpose([0, 2, 1])
         self.blstm.flatten_parameters()
         # (B, Tmax, C)
-        xs, _ = self.blstm(xs)
-        # hlens 是什么
+        # see https://www.paddlepaddle.org.cn/documentation/docs/zh/faq/train_cn.html#paddletorch-nn-utils-rnn-pack-padded-sequencetorch-nn-utils-rnn-pad-packed-sequenceapi
+        xs, _ = self.blstm(xs, sequence_length=ilens)
         hlens = ilens
         return xs, hlens
 
diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py
index 907e3dafafb2ea2d78919f9d5042d7b8d402872c..64274d5380bffcdd483fbaca2a5448a77f6611ee 100644
--- a/paddlespeech/t2s/training/optimizer.py
+++ b/paddlespeech/t2s/training/optimizer.py
@@ -26,10 +26,13 @@ optim_classes = dict(
     sgd=paddle.optimizer.SGD, )
 
 
-def build_optimizers(model: nn.Layer,
-                     optim='adadelta',
-                     max_grad_norm=None,
-                     learning_rate=0.01) -> paddle.optimizer:
+def build_optimizers(
+        model: nn.Layer,
+        optim='adadelta',
+        max_grad_norm=None,
+        learning_rate=0.01,
+        weight_decay=None,
+        epsilon=1.0e-6, ) -> paddle.optimizer:
     optim_class = optim_classes.get(optim)
     if optim_class is None:
         raise ValueError(f"must be one of {list(optim_classes)}: {optim}")
@@ -37,10 +40,13 @@ def build_optimizers(model: nn.Layer,
     grad_clip = None
     if max_grad_norm:
         grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
-    optim = optim_class(
-        parameters=model.parameters(),
-        learning_rate=learning_rate,
-        grad_clip=grad_clip)
+    optim_dict = {}
+    optim_dict['parameters'] = model.parameters()
+    optim_dict['learning_rate'] = learning_rate
+    optim_dict['grad_clip'] = grad_clip
+    optim_dict['weight_decay'] = weight_decay
+    if optim not in {'momentum', 'sgd'}:
+        optim_dict['epsilon'] = epsilon
+    optimizers = optim_class(**optim_dict)
 
-    optimizers = optim
     return optimizers
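
Note (illustrative sketch, not part of the patch): with the extended build_optimizers signature above, a typical call could look as follows; the Linear layer is only a stand-in for a real acoustic model and the hyper-parameter values are placeholders.

from paddle import nn

from paddlespeech.t2s.training.optimizer import build_optimizers

model = nn.Linear(80, 80)        # stand-in for a real model
optimizer = build_optimizers(
    model,
    optim='adadelta',            # must be a key of optim_classes
    max_grad_norm=1.0,           # enables ClipGradByGlobalNorm
    learning_rate=1.0e-3,
    weight_decay=1.0e-6,
    epsilon=1.0e-6)              # skipped for momentum/sgd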
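
Note (illustrative sketch, not part of the patch): the mask built by GuidedAttentionLoss._make_guided_attention_mask above follows W[t, n] = 1 - exp(-(n/ilen - t/olen)^2 / (2 * sigma^2)). A minimal NumPy version reproduces the docstring example for ilen = olen = 5 and sigma = 0.4 (zeros on the diagonal, about 0.1175 one step off it).

import numpy as np

def guided_attention_mask(ilen, olen, sigma=0.4):
    # penalty grows as the attention position moves away from the diagonal
    grid_t, grid_n = np.meshgrid(
        np.arange(olen), np.arange(ilen), indexing="ij")
    return 1.0 - np.exp(-((grid_n / ilen - grid_t / olen)**2) / (2 * sigma**2))

print(np.round(guided_attention_mask(5, 5), 4))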
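
Note (illustrative sketch, not part of the patch): _apply_attention_constraint above implements the Deep Voice 3 style monotonic constraint by masking energies outside a small window around the last attended frame, so the subsequent softmax can only look slightly backward or forward.

import numpy as np

def apply_attention_constraint(e, last_attended_idx,
                               backward_window=1, forward_window=3):
    # e: attention energies for a single utterance, shape (T,)
    e = e.copy()
    backward_idx = last_attended_idx - backward_window
    forward_idx = last_attended_idx + forward_window
    if backward_idx > 0:
        e[:backward_idx] = -np.inf
    if forward_idx < e.shape[0]:
        e[forward_idx:] = -np.inf
    return e

print(apply_attention_constraint(np.zeros(10), last_attended_idx=4))
# only indices 3..6 remain finite, so softmax ignores the rest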
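
Note (illustrative sketch, not part of the patch): the forward-attention update in AttForward.forward above mixes the previous attention weights with a one-frame right shift before multiplying them into the new softmax weights, so each decoder step can only stay on the current encoder frame or advance by one. A single-utterance NumPy sketch of that update step:

import numpy as np

def forward_attention_step(e, att_prev, scaling=1.0, eps=1e-6):
    # softmax over encoder frames
    w = np.exp(scaling * e - np.max(scaling * e))
    w /= w.sum()
    # shift previous weights one frame to the right (zero-padded at the front)
    att_shift = np.concatenate([[0.0], att_prev[:-1]])
    w = (att_prev + att_shift) * w
    # clip to avoid exact zeros, then renormalise (mirrors the paddle code)
    w = np.clip(w, eps, None)
    return w / w.sum()

att_prev = np.array([1.0, 0.0, 0.0, 0.0])   # initial attention on frame 0
e = np.array([0.1, 0.3, 0.0, -0.2])         # toy attention energies
print(forward_attention_step(e, att_prev))  # nearly all mass on frames 0 and 1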
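
Note (illustrative sketch, not part of the patch): ZoneOutCell._zoneout above keeps each hidden-state entry from the previous step with probability zoneout_rate during training and uses the deterministic expectation at evaluation time.

import numpy as np

rng = np.random.default_rng(0)

def zoneout(h_prev, h_next, rate=0.1, training=True):
    if training:
        # keep the old value where mask == 1, take the new one elsewhere
        mask = rng.binomial(1, rate, size=h_prev.shape)
        return mask * h_prev + (1 - mask) * h_next
    # evaluation: expected value of the stochastic mix
    return rate * h_prev + (1 - rate) * h_next

h_prev, h_next = np.zeros(4), np.ones(4)
print(zoneout(h_prev, h_next, rate=0.5))                  # random 0/1 mix
print(zoneout(h_prev, h_next, rate=0.5, training=False))  # [0.5 0.5 0.5 0.5]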