Unverified commit 175c39b4, authored by 小湉湉, committed by GitHub

Merge pull request #1511 from yt605155624/pre_fix_for_streaming

[TTS] Add RTF reporting to synthesize, and more vocoders to synthesize.sh
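For context: RTF (real-time factor) is the wall-clock time spent synthesizing divided by the duration of the audio produced, so RTF < 1.0 means faster-than-real-time synthesis. A minimal sketch of the quantity this PR logs (the helper below is illustrative, not code from the PR):

    # Illustrative helper, not part of this PR's code.
    def real_time_factor(num_samples: int, elapsed_s: float, sample_rate: int) -> float:
        """RTF = synthesis wall-clock time / duration of the generated audio."""
        audio_seconds = num_samples / sample_rate
        return elapsed_s / audio_seconds

    # e.g. 48000 samples at 24 kHz synthesized in 0.5 s -> RTF = 0.5 / 2.0 = 0.25
    print(real_time_factor(48000, 0.5, 24000))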
@@ -3,10 +3,14 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
         --am=tacotron2_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@@ -18,3 +22,79 @@ python3 ${BIN_DIR}/../synthesize.py \
         --test_metadata=dump/test/norm/metadata.jsonl \
         --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=tacotron2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
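Usage note: each vocoder now lives in its own stage, so editing stage and stop_stage at the top of the script selects which vocoders to synthesize with; for example, stage=3 and stop_stage=3 runs only the hifigan block, while stage=0 and stop_stage=4 runs all five.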
@@ -8,6 +8,7 @@ stage=0
 stop_stage=0
 
 # TODO: the dygraph-to-static output of tacotron2 is not as loud as the static-graph result; some function in decode is probably not aligned between dynamic and static graphs
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -39,14 +40,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
         --am_stat=dump/train/speech_stats.npy \
         --voc=mb_melgan_csmsc \
-        --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
-        --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz \
-        --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 
 # the pretrained models haven't been released yet
@@ -88,8 +89,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # wavernn
......
 #!/bin/bash
 
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
         --am=speedyspeech_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-        --am_stat=dump/train/feats_stats.npy \
+        --am_stat=dump/train/speech_stats.npy \
         --voc=pwgan_csmsc \
         --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
         --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
@@ -18,3 +23,83 @@ python3 ${BIN_DIR}/../synthesize.py \
         --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt \
         --tones_dict=dump/tone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=speedyspeech_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --tones_dict=dump/tone_id_map.txt \
+        --phones_dict=dump/phone_id_map.txt
+fi
@@ -7,6 +7,7 @@ ckpt_name=$3
 stage=0
 stop_stage=0
 
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -22,9 +23,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 
 # for more GAN Vocoders
@@ -44,9 +45,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 
 # the pretrained models haven't been released yet
@@ -88,12 +89,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
         --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
+        --tones_dict=dump/tone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 # wavernn
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     echo "in wavernn syn_e2e"
......
@@ -3,10 +3,14 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
+stage=0
+stop_stage=0
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
         --am=fastspeech2_csmsc \
         --am_config=${config_path} \
         --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
@@ -18,3 +22,79 @@ python3 ${BIN_DIR}/../synthesize.py \
         --test_metadata=dump/test/norm/metadata.jsonl \
         --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt
+fi
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz \
+        --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=style_melgan_csmsc \
+        --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+        --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "in hifigan syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+        --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "in wavernn syn"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize.py \
+        --am=fastspeech2_csmsc \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=wavernn_csmsc \
+        --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+        --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
+        --phones_dict=dump/phone_id_map.txt
+fi
@@ -7,6 +7,7 @@ ckpt_name=$3
 stage=0
 stop_stage=0
 
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     FLAGS_allocator_strategy=naive_best_fit \
     FLAGS_fraction_of_gpu_memory_to_use=0.01 \
@@ -22,8 +23,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 
 # for more GAN Vocoders
@@ -43,8 +44,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
 
 # the pretrained models haven't been released yet
@@ -86,8 +87,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi
......
@@ -20,6 +20,7 @@ import numpy as np
 import paddle
 import soundfile as sf
 import yaml
+from timer import timer
 from yacs.config import CfgNode
 
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -50,6 +51,18 @@ model_alias = {
     "paddlespeech.t2s.models.melgan:MelGANGenerator",
     "mb_melgan_inference":
     "paddlespeech.t2s.models.melgan:MelGANInference",
+    "style_melgan":
+    "paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
+    "style_melgan_inference":
+    "paddlespeech.t2s.models.melgan:StyleMelGANInference",
+    "hifigan":
+    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
+    "hifigan_inference":
+    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
+    "wavernn":
+    "paddlespeech.t2s.models.wavernn:WaveRNN",
+    "wavernn_inference":
+    "paddlespeech.t2s.models.wavernn:WaveRNNInference",
 }
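These aliases are resolved by dynamic_import, which maps a short name through the table above to a "module:attribute" path. A minimal sketch of that resolution, assuming the "module:attribute" convention visible in the table (the real paddlespeech.s2t.utils.dynamic_import may differ in details):

    import importlib

    def resolve_alias(name: str, alias_table: dict):
        # assumes table values follow the "package.module:ClassName" convention
        module_path, _, attr = alias_table[name].partition(":")
        return getattr(importlib.import_module(module_path), attr)

    # e.g. resolve_alias("wavernn", model_alias) would return the WaveRNN class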
@@ -146,10 +159,15 @@ def evaluate(args):
     voc_name = args.voc[:args.voc.rindex('_')]
     voc_class = dynamic_import(voc_name, model_alias)
     voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
-    voc = voc_class(**voc_config["generator_params"])
-    voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
-    voc.remove_weight_norm()
-    voc.eval()
+    if voc_name != 'wavernn':
+        voc = voc_class(**voc_config["generator_params"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
+        voc.remove_weight_norm()
+        voc.eval()
+    else:
+        voc = voc_class(**voc_config["model"])
+        voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
+        voc.eval()
     voc_mu, voc_std = np.load(args.voc_stat)
     voc_mu = paddle.to_tensor(voc_mu)
     voc_std = paddle.to_tensor(voc_std)
@@ -162,8 +180,12 @@ def evaluate(args):
     output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
 
+    N = 0
+    T = 0
     for datum in test_dataset:
         utt_id = datum["utt_id"]
-        with paddle.no_grad():
+        with timer() as t:
+            with paddle.no_grad():
                 # acoustic model
                 if am_name == 'fastspeech2':
@@ -175,7 +197,8 @@ def evaluate(args):
                         spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
                     elif "spk_id" in datum:
                         spk_id = paddle.to_tensor(datum["spk_id"])
-                    mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb)
+                    mel = am_inference(
+                        phone_ids, spk_id=spk_id, spk_emb=spk_emb)
                 elif am_name == 'speedyspeech':
                     phone_ids = paddle.to_tensor(datum["phones"])
                     tone_ids = paddle.to_tensor(datum["tones"])
@@ -189,11 +212,19 @@ def evaluate(args):
                     mel = am_inference(phone_ids, spk_emb=spk_emb)
                 # vocoder
                 wav = voc_inference(mel)
+        wav = wav.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
 
 
 def main():
@@ -246,7 +277,8 @@ def main():
         default='pwgan_csmsc',
         choices=[
             'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
-            'mb_melgan_csmsc'
+            'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
+            'style_melgan_csmsc'
         ],
         help='Choose vocoder type of tts task.')
......
@@ -21,6 +21,7 @@ import soundfile as sf
 import yaml
 from paddle import jit
 from paddle.static import InputSpec
+from timer import timer
 from yacs.config import CfgNode
 
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -233,8 +234,10 @@ def evaluate(args):
     # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
     if am_name == 'tacotron2':
         merge_sentences = True
+    N = 0
+    T = 0
     for utt_id, sentence in sentences:
-        get_tone_ids = False
-        if am_name == 'speedyspeech':
-            get_tone_ids = True
+        with timer() as t:
+            get_tone_ids = False
+            if am_name == 'speedyspeech':
+                get_tone_ids = True
@@ -281,11 +284,18 @@ def evaluate(args):
                     flags = 1
                 else:
                     wav_all = paddle.concat([wav_all, wav])
+        wav = wav_all.numpy()
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = am_config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
         sf.write(
-            str(output_dir / (utt_id + ".wav")),
-            wav_all.numpy(),
-            samplerate=am_config.fs)
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
 
 
 def main():
......
@@ -91,7 +91,7 @@ def main():
             target=config.inference.target,
             overlap=config.inference.overlap,
             mu_law=config.mu_law,
-            gen_display=True)
+            gen_display=False)
     wav = wav.numpy()
     N += wav.size
     T += t.elapse
......
@@ -66,7 +66,7 @@ class MelGANGenerator(nn.Layer):
         nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the nonlinear activation in the upsample network,
             by default {}
         pad (str): Padding function module name before dilated convolution layer.
-        pad_params dict): Hyperparameters for padding function.
+        pad_params (dict): Hyperparameters for padding function.
         use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
         use_weight_norm (bool): Whether to use weight norm.
             If set to true, it will be applied to all of the conv layers.
......
@@ -509,16 +509,20 @@ class WaveRNN(nn.Layer):
         total_len = num_folds * (target + overlap) + overlap
 
         # Need some silence for the run warmup
-        slience_len = overlap // 2
+        slience_len = 0
+        linear_len = slience_len
         fade_len = overlap - slience_len
         slience = paddle.zeros([slience_len], dtype=paddle.float32)
-        linear = paddle.ones([fade_len], dtype=paddle.float32)
+        linear = paddle.ones([linear_len], dtype=paddle.float32)
 
         # Equal power crossfade
         # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
-        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
-        fade_in = paddle.sqrt(0.5 * (1 + t))
-        fade_out = paddle.sqrt(0.5 * (1 - t))
+        sigmoid_scale = 2.3
+        t = paddle.linspace(
+            -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
+        # a sigmoid curve should work better here
+        fade_in = paddle.nn.functional.sigmoid(t)
+        fade_out = 1 - paddle.nn.functional.sigmoid(t)
 
         # Concat the silence to the fades
         fade_out = paddle.concat([linear, fade_out])
         fade_in = paddle.concat([slience, fade_in])
......
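The crossfade change above replaces the equal-power windows (where fade_in² + fade_out² = 1 across the overlap) with sigmoid windows (where fade_in + fade_out = 1) and drops the warmup silence. A small numpy sketch of both windows, with an illustrative fade_len (numpy stands in for paddle here):

    import numpy as np

    fade_len = 300                  # illustrative overlap length in samples
    sigmoid_scale = 2.3             # the value introduced by this diff

    # old: equal-power crossfade, fade_in**2 + fade_out**2 == 1
    t = np.linspace(-1.0, 1.0, fade_len, dtype=np.float32)
    fade_in_pow = np.sqrt(0.5 * (1.0 + t))
    fade_out_pow = np.sqrt(0.5 * (1.0 - t))

    # new: sigmoid crossfade, fade_in + fade_out == 1
    t = np.linspace(-sigmoid_scale, sigmoid_scale, fade_len, dtype=np.float32)
    fade_in_sig = 1.0 / (1.0 + np.exp(-t))
    fade_out_sig = 1.0 - fade_in_sig

    print(np.abs(fade_in_pow**2 + fade_out_pow**2 - 1.0).max())  # ~0
    print(np.abs(fade_in_sig + fade_out_sig - 1.0).max())        # ~0

Equal-power fades keep the summed power constant, which suits uncorrelated signals; for the nearly identical waveforms in adjacent WaveRNN folds, an amplitude-preserving fade like the sigmoid avoids the slight loudness bump in the overlap region.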