Unverified commit dcfc32f1, authored by Hui Zhang, committed by GitHub

Merge pull request #1379 from yt605155624/new_wavernn

[TTS] add wavernn
...@@ -49,3 +49,14 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
python3 ${BIN_DIR}/../inference.py \
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=wavernn_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
...@@ -89,3 +89,25 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
# wavernn
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "in wavernn syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=wavernn_csmsc \
--voc_config=wavernn_test/default.yaml \
--voc_ckpt=wavernn_test/snapshot_iter_5000.pdz \
--voc_stat=wavernn_test/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
fi
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
mu_law: True # Recommended to suppress noise if using raw bits.
###########################################################
# MODEL SETTING #
###########################################################
model:
rnn_dims: 512 # Hidden dims of RNN Layers.
fc_dims: 512
bits: 9 # Bit depth of signal
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
aux_channels: 80 # Number of channels for auxiliary feature conv.
# Must be the same as num_mels.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. Product of these must equal the hop size (same as pwgan here).
compute_dims: 128 # Dims of Conv1D in MelResNet.
res_out_dims: 128 # Dims of output in MelResNet.
res_blocks: 10 # Number of residual blocks.
mode: RAW # either 'raw'(softmax on raw bits) or 'mold' (sample from mixture of logistics)
inference:
gen_batched: True # whether to generate samples in batch mode
target: 12000 # target number of samples to be generated in each batch entry
overlap: 600 # number of samples for crossfading between batches
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 64 # Batch size.
batch_max_steps: 4500 # Length of each audio in batch. Must be divisible by hop_size.
num_workers: 2 # Number of workers in DataLoader.
###########################################################
# OPTIMIZER SETTING #
###########################################################
grad_clip: 4.0
learning_rate: 1.0e-4
###########################################################
# INTERVAL SETTING #
###########################################################
train_max_steps: 400000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
gen_eval_samples_interval_steps: 5000 # step interval for generating samples on the validation set
generate_num: 5 # number of samples to generate at each checkpoint
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
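A quick sanity check of how these values interlock (an editor's sketch, not part of the recipe; the variable names simply mirror the config above):

import math

n_shift = 300                  # hop size
batch_max_steps = 4500         # samples per training clip
aux_context_window = 2
upsample_scales = [4, 5, 3, 5]

assert math.prod(upsample_scales) == n_shift   # 4 * 5 * 3 * 5 = 300
assert batch_max_steps % n_shift == 0          # 4500 / 300 = 15 frames per clip
mel_win = batch_max_steps // n_shift + 2 * aux_context_window
print(mel_win)                                 # 19 mel frames fed to the model per clip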
#!/bin/bash
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
--rootdir=~/datasets/BZNSYP/ \
--dataset=baker \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--cut-sil=True \
--num-cpu=20
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="feats"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--stats=dump/train/feats_stats.npy
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--stats=dump/train/feats_stats.npy
python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--stats=dump/train/feats_stats.npy
fi
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--test-metadata=dump/test/norm/metadata.jsonl \
--output-dir=${train_output_path}/test
#!/bin/bash
config_path=$1
train_output_path=$2
FLAGS_cudnn_exhaustive_search=true \
FLAGS_conv_workspace_size_limit=4000 \
python ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=wavernn
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
#!/bin/bash
set -e
source path.sh
gpus=0,1
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
test_input=dump/dump_gta_test
ckpt_name=snapshot_iter_100000.pdz
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
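Since run.sh sources parse_options.sh, the stage variables can also be overridden from the command line, e.g. ./run.sh --stage 1 --stop-stage 1 to run only training (assuming the usual Kaldi-style option parsing, which maps --stop-stage to stop_stage).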
...@@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
from .codec import *
from .spec_normalizer import LogMagnitude
from .spec_normalizer import NormalizerBase
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import paddle
# x: [0, 2**bits - 1], return: [-1, 1]
def label_2_float(x, bits):
return 2 * x / (2**bits - 1.) - 1.
# x: [-1, 1], return: [0, 2**bits - 1]
def float_2_label(x, bits):
assert abs(x).max() <= 1.0
x = (x + 1.) * (2**bits - 1) / 2
return x.clip(0, 2**bits - 1)
# y: [-1, 1], mu: 2**bits, return: [0, 2**bits-1]
# see https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
# be careful the input `mu` here, which is +1 than that of the link above
def encode_mu_law(x, mu):
mu = mu - 1
fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)
return np.floor((fx + 1) / 2 * mu + 0.5)
# from_labels = True:
#   y: [0, 2**bits - 1], mu: 2**bits, return: [-1, 1]
# from_labels = False:
#   y: [-1, 1], return: [-1, 1]
def decode_mu_law(y, mu, from_labels=True):
# TODO: get rid of log2 - makes no sense
if from_labels:
y = label_2_float(y, math.log2(mu))
mu = mu - 1
x = paddle.sign(y) / mu * ((1 + mu)**paddle.abs(y) - 1)
return x
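A minimal round-trip through the helpers above (an illustrative sketch; the toy waveform is invented for demonstration):

import numpy as np
import paddle
from paddlespeech.t2s.audio.codec import decode_mu_law
from paddlespeech.t2s.audio.codec import encode_mu_law

bits = 9
wav = np.sin(np.linspace(0, 4 * np.pi, 16)).astype(np.float32)  # toy signal in [-1, 1]
labels = encode_mu_law(wav, mu=2**bits)  # integer labels in [0, 511]
recon = decode_mu_law(paddle.to_tensor(labels), mu=2**bits, from_labels=True)
print(float(np.abs(recon.numpy() - wav).max()))  # small quantization error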
...@@ -14,6 +14,10 @@
import numpy as np
import paddle
from paddlespeech.t2s.audio.codec import encode_mu_law
from paddlespeech.t2s.audio.codec import float_2_label
from paddlespeech.t2s.audio.codec import label_2_float
class Clip(object):
"""Collate functor for training vocoders.
...@@ -49,7 +53,7 @@ class Clip(object):
self.end_offset = -(self.batch_max_frames + aux_context_window)
self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
def __call__(self, batch):
"""Convert into batch tensors.
Parameters
...@@ -67,11 +71,11 @@ class Clip(object):
"""
# check length
batch = [
self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
xs, cs = [b[0] for b in batch], [b[1] for b in batch]
# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
...@@ -89,7 +93,7 @@ class Clip(object):
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
# convert each batch to tensor, assume that each item in batch has the same length
y_batch = paddle.to_tensor(
y_batch, dtype=paddle.float32).unsqueeze(1) # (B, 1, T)
c_batch = paddle.to_tensor(
...@@ -120,3 +124,113 @@ class Clip(object):
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
return x, c
class WaveRNNClip(Clip):
def __init__(self,
mode: str='RAW',
batch_max_steps: int=4500,
hop_size: int=300,
aux_context_window: int=2,
bits: int=9,
mu_law: bool=True):
self.mode = mode
self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
self.batch_max_steps = batch_max_steps
self.hop_size = hop_size
self.aux_context_window = aux_context_window
self.mu_law = mu_law
self.batch_max_frames = batch_max_steps // hop_size
self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
if self.mode == 'MOL':
self.bits = 16
else:
self.bits = bits
def to_quant(self, wav):
if self.mode == 'RAW':
if self.mu_law:
quant = encode_mu_law(wav, mu=2**self.bits)
else:
quant = float_2_label(wav, bits=self.bits)
elif self.mode == 'MOL':
quant = float_2_label(wav, bits=16)
quant = quant.astype(np.int64)
return quant
def __call__(self, batch):
# voc_pad = 2: pad the input so the resnet can 'see' wider than the input length
# max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
"""Convert into batch tensors.
Parameters
----------
batch : list
list of tuple of the pair of audio and features.
Audio shape (T, ), features shape(T', C).
Returns
----------
Tensor
Input signal batch (B, 1, T).
Tensor
Target signal batch (B, 1, T).
Tensor
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
"""
# check length
batch = [
self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
wav, mel = [b[0] for b in batch], [b[1] for b in batch]
# transpose mel here: (T', C) -> (C, T')
mel = [x.T for x in mel]
max_offsets = [
x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
for x in mel
]
# randomly select the mel slice start offsets
mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
# the corresponding wav slice offsets, 2 (= pad) frames after the mel offsets
sig_offsets = [(offset + self.aux_context_window) * self.hop_size
for offset in mel_offsets]
# mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad
mels = [
x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
for i, x in enumerate(mel)
]
# label.shape[1] = voc_seq_len + 1
wav = [self.to_quant(x) for x in wav]
labels = [
x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
for i, x in enumerate(wav)
]
mels = np.stack(mels).astype(np.float32)
labels = np.stack(labels).astype(np.int64)
mels = paddle.to_tensor(mels)
labels = paddle.to_tensor(labels, dtype='int64')
# x is input, y is label
x = labels[:, :self.batch_max_steps]
y = labels[:, 1:]
'''
mode = RAW:
    mu_law = True:
        quant: bits = 9, integer labels 0, 1, 2, ..., 509, 510, 511
    mu_law = False:
        quant: bits = 9, float labels in [0, 511]
mode = MOL:
    quant: bits = 16, float labels in [0, 65535]
'''
# x should be normalized to [-1, 1] in RAW mode
x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
# y should be normalized to [-1, 1] in MOL mode
if self.mode == 'MOL':
y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
return x, y, mels
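The one-sample shift between x and y above is standard autoregressive teacher forcing; schematically (hypothetical label values):

# labels = [l0, l1, l2, l3, ...]    # batch_max_steps + 1 quantized samples
# x = labels[:, :batch_max_steps]   # [l0, l1, l2, ...], network input
# y = labels[:, 1:]                 # [l1, l2, l3, ...], prediction target
# at every timestep the model predicts the next sample given the previous ones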
...@@ -54,7 +54,7 @@ def main():
default='pwgan_csmsc',
choices=[
'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3',
'pwgan_vctk', 'wavernn_csmsc'
],
help='Choose vocoder type of tts task.')
# other
......
...@@ -59,6 +59,10 @@ model_alias = {
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
...@@ -151,10 +155,16 @@ def evaluate(args):
voc_name = args.voc[:args.voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference', model_alias)
if voc_name != 'wavernn':
voc = voc_class(**voc_config["generator_params"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"])
voc.remove_weight_norm()
voc.eval()
else:
voc = voc_class(**voc_config["model"])
voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"])
voc.eval()
voc_mu, voc_std = np.load(args.voc_stat)
voc_mu = paddle.to_tensor(voc_mu)
voc_std = paddle.to_tensor(voc_std)
...@@ -322,7 +332,8 @@ def main():
default='pwgan_csmsc',
choices=[
'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
'wavernn_csmsc'
],
help='Choose vocoder type of tts task.')
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import soundfile as sf
import yaml
from paddle import distributed as dist
from timer import timer
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.wavernn import WaveRNN
def main():
parser = argparse.ArgumentParser(description="Synthesize with WaveRNN.")
parser.add_argument("--config", type=str, help="GANVocoder config file.")
parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
parser.add_argument("--test-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
args = parser.parse_args()
with open(args.config) as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
if args.ngpu == 0:
paddle.set_device("cpu")
elif args.ngpu > 0:
paddle.set_device("gpu")
else:
print("ngpu should >= 0 !")
model = WaveRNN(
hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
state_dict = paddle.load(args.checkpoint)
model.set_state_dict(state_dict["main_params"])
model.eval()
with jsonlines.open(args.test_metadata, 'r') as reader:
metadata = list(reader)
test_dataset = DataTable(
metadata,
fields=['utt_id', 'feats'],
converters={
'utt_id': None,
'feats': np.load,
})
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
N = 0
T = 0
for example in test_dataset:
utt_id = example['utt_id']
mel = example['feats']
mel = paddle.to_tensor(mel) # (T, C)
with timer() as t:
with paddle.no_grad():
wav = model.generate(
c=mel,
batched=config.inference.gen_batched,
target=config.inference.target,
overlap=config.inference.overlap,
mu_law=config.mu_law,
gen_display=True)
wav = wav.numpy()
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
rtf = config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
if __name__ == "__main__":
main()
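Note on the metrics printed above: speed is generated samples per wall-clock second, so rtf = config.fs / speed equals generation time divided by audio duration; an RTF below 1.0 means synthesis runs faster than real time.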
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import shutil
from pathlib import Path
import jsonlines
import numpy as np
import paddle
import yaml
from paddle import DataParallel
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip
from paddlespeech.t2s.models.wavernn import WaveRNN
from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator
from paddlespeech.t2s.models.wavernn import WaveRNNUpdater
from paddlespeech.t2s.modules.losses import discretized_mix_logistic_loss
from paddlespeech.t2s.training.extensions.snapshot import Snapshot
from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
world_size = paddle.distributed.get_world_size()
if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
paddle.set_device("cpu")
else:
paddle.set_device("gpu")
if world_size > 1:
paddle.distributed.init_parallel_env()
# set the random seed, it is a must for multiprocess training
seed_everything(config.seed)
print(
f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
)
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
train_metadata = list(reader)
train_dataset = DataTable(
data=train_metadata,
fields=["wave", "feats"],
converters={
"wave": np.load,
"feats": np.load,
}, )
with jsonlines.open(args.dev_metadata, 'r') as reader:
dev_metadata = list(reader)
dev_dataset = DataTable(
data=dev_metadata,
fields=["wave", "feats"],
converters={
"wave": np.load,
"feats": np.load,
}, )
batch_fn = WaveRNNClip(
mode=config.model.mode,
aux_context_window=config.model.aux_context_window,
hop_size=config.n_shift,
batch_max_steps=config.batch_max_steps,
bits=config.model.bits)
# collate function and dataloader
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=config.batch_size,
shuffle=True,
drop_last=True)
dev_sampler = DistributedBatchSampler(
dev_dataset,
batch_size=config.batch_size,
shuffle=False,
drop_last=False)
print("samplers done!")
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
collate_fn=batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
collate_fn=batch_fn,
batch_sampler=dev_sampler,
num_workers=config.num_workers)
valid_generate_loader = DataLoader(dev_dataset, batch_size=1)
print("dataloaders done!")
model = WaveRNN(
hop_length=config.n_shift, sample_rate=config.fs, **config["model"])
if world_size > 1:
model = DataParallel(model)
print("model done!")
if config.model.mode == 'RAW':
criterion = paddle.nn.CrossEntropyLoss(axis=1)
elif config.model.mode == 'MOL':
criterion = discretized_mix_logistic_loss
else:
criterion = None
raise RuntimeError('Unknown model mode value - ', config.model.mode)
print("criterions done!")
clip = paddle.nn.ClipGradByGlobalNorm(config.grad_clip)
optimizer = Adam(
parameters=model.parameters(),
learning_rate=config.learning_rate,
grad_clip=clip)
print("optimizer done!")
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
if dist.get_rank() == 0:
config_name = args.config.split("/")[-1]
# copy conf to output_dir
shutil.copyfile(args.config, output_dir / config_name)
updater = WaveRNNUpdater(
model=model,
optimizer=optimizer,
criterion=criterion,
dataloader=train_dataloader,
output_dir=output_dir,
mode=config.model.mode)
evaluator = WaveRNNEvaluator(
model=model,
dataloader=dev_dataloader,
criterion=criterion,
output_dir=output_dir,
valid_generate_loader=valid_generate_loader,
config=config)
trainer = Trainer(
updater,
stop_trigger=(config.train_max_steps, "iteration"),
out=output_dir)
if dist.get_rank() == 0:
trainer.extend(
evaluator, trigger=(config.eval_interval_steps, 'iteration'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots),
trigger=(config.save_interval_steps, 'iteration'))
print("Trainer Done!")
trainer.run()
def main():
# parse args and config and redirect to train_sp
parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
parser.add_argument(
"--config", type=str, help="config file to overwrite default config.")
parser.add_argument("--train-metadata", type=str, help="training data.")
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
args = parser.parse_args()
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
print("========Args========")
print(yaml.safe_dump(vars(args)))
print("========Config========")
print(config)
print(
f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
)
# dispatch
if args.ngpu > 1:
dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
else:
train_sp(args, config)
if __name__ == "__main__":
main()
...@@ -19,3 +19,4 @@ from .parallel_wavegan import *
from .speedyspeech import *
from .transformer_tts import *
from .waveflow import *
from .wavernn import *
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .wavernn import *
from .wavernn_updater import *
(This diff is collapsed.)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import paddle
import soundfile as sf
from paddle import distributed as dist
from paddle.io import DataLoader
from paddle.nn import Layer
from paddle.optimizer import Optimizer
from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
logging.basicConfig(
format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
datefmt='[%Y-%m-%d %H:%M:%S]')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
def calculate_grad_norm(parameters, norm_type: str=2):
'''
calculate grad norm of model's parameters
parameters:
model's parameters
norm_type: str
Returns
------------
Tensor
grad_norm
'''
grad_list = [
paddle.to_tensor(p.grad) for p in parameters if p.grad is not None
]
norm_list = paddle.stack(
[paddle.norm(grad, norm_type) for grad in grad_list])
total_norm = paddle.norm(norm_list)
return total_norm
# for save name in gen_valid_samples()
ITERATION = 0
class WaveRNNUpdater(StandardUpdater):
def __init__(self,
model: Layer,
optimizer: Optimizer,
criterion: Layer,
dataloader: DataLoader,
init_state=None,
output_dir: Path=None,
mode='RAW'):
super().__init__(model, optimizer, dataloader, init_state=None)
self.criterion = criterion
# self.scheduler = scheduler
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
self.mode = mode
def update_core(self, batch):
self.msg = "Rank: {}, ".format(dist.get_rank())
losses_dict = {}
# parse batch
self.model.train()
self.optimizer.clear_grad()
wav, y, mel = batch
y_hat = self.model(wav, mel)
if self.mode == 'RAW':
y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
elif self.mode == 'MOL':
y = paddle.cast(y, dtype='float32')
y = y.unsqueeze(-1)
loss = self.criterion(y_hat, y)
loss.backward()
grad_norm = float(
calculate_grad_norm(self.model.parameters(), norm_type=2))
self.optimizer.step()
report("train/loss", float(loss))
report("train/grad_norm", float(grad_norm))
losses_dict["loss"] = float(loss)
losses_dict["grad_norm"] = float(grad_norm)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
global ITERATION
ITERATION = self.state.iteration + 1
class WaveRNNEvaluator(StandardEvaluator):
def __init__(self,
model: Layer,
criterion: Layer,
dataloader: Optimizer,
output_dir: Path=None,
valid_generate_loader=None,
config=None):
super().__init__(model, dataloader)
log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
self.filehandler = logging.FileHandler(str(log_file))
logger.addHandler(self.filehandler)
self.logger = logger
self.msg = ""
self.criterion = criterion
self.valid_generate_loader = valid_generate_loader
self.config = config
self.mode = config.model.mode
self.valid_samples_dir = output_dir / "valid_samples"
self.valid_samples_dir.mkdir(parents=True, exist_ok=True)
def evaluate_core(self, batch):
self.msg = "Evaluate: "
losses_dict = {}
# parse batch
wav, y, mel = batch
y_hat = self.model(wav, mel)
if self.mode == 'RAW':
y_hat = y_hat.transpose([0, 2, 1]).unsqueeze(-1)
elif self.mode == 'MOL':
y = paddle.cast(y, dtype='float32')
y = y.unsqueeze(-1)
loss = self.criterion(y_hat, y)
report("eval/loss", float(loss))
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
self.logger.info(self.msg)
def gen_valid_samples(self):
for i, item in enumerate(self.valid_generate_loader):
if i >= self.config.generate_num:
break
print(
'\n| Generating: {}/{}'.format(i + 1, self.config.generate_num))
mel = item['feats']
wav = item['wave']
wav = wav.squeeze(0)
origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format(
self.iteration, i)
sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs)
if self.config.inference.gen_batched:
batch_str = 'gen_batched_target{}_overlap{}'.format(
self.config.inference.target, self.config.inference.overlap)
else:
batch_str = 'gen_not_batched'
gen_save_path = str(self.valid_samples_dir /
'{}_steps_{}_{}.wav'.format(self.iteration, i,
batch_str))
# (1, T, C_aux) -> (T, C_aux)
mel = mel.squeeze(0)
gen_sample = self.model.generate(
mel, self.config.inference.gen_batched,
self.config.inference.target, self.config.inference.overlap,
self.config.mu_law)
sf.write(
gen_save_path, gen_sample.numpy(), samplerate=self.config.fs)
def __call__(self, trainer=None):
summary = self.evaluate()
for k, v in summary.items():
report(k, v)
# gen samples at the end of evaluate
self.iteration = ITERATION
if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
self.gen_valid_samples()
...@@ -14,6 +14,7 @@
import math
import librosa
import numpy as np
import paddle
from paddle import nn
from paddle.fluid.layers import sequence_mask
...@@ -23,6 +24,145 @@ from scipy import signal
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
# Losses for WaveRNN
def log_sum_exp(x):
""" numerically stable log_sum_exp implementation that prevents overflow """
# TF ordering
axis = len(x.shape) - 1
m = paddle.max(x, axis=axis)
m2 = paddle.max(x, axis=axis, keepdim=True)
return m + paddle.log(paddle.sum(paddle.exp(x - m2), axis=axis))
# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
def discretized_mix_logistic_loss(y_hat,
y,
num_classes=65536,
log_scale_min=None,
reduce=True):
if log_scale_min is None:
log_scale_min = float(np.log(1e-14))
y_hat = y_hat.transpose([0, 2, 1])
assert y_hat.dim() == 3
assert y_hat.shape[1] % 3 == 0
nr_mix = y_hat.shape[1] // 3
# (B x T x C)
y_hat = y_hat.transpose([0, 2, 1])
# unpack parameters. (B, T, num_mixtures) x 3
logit_probs = y_hat[:, :, :nr_mix]
means = y_hat[:, :, nr_mix:2 * nr_mix]
log_scales = paddle.clip(
y_hat[:, :, 2 * nr_mix:3 * nr_mix], min=log_scale_min)
# B x T x 1 -> B x T x num_mixtures
y = y.expand_as(means)
centered_y = paddle.cast(y, dtype=paddle.get_default_dtype()) - means
inv_stdv = paddle.exp(-log_scales)
plus_in = inv_stdv * (centered_y + 1. / (num_classes - 1))
cdf_plus = F.sigmoid(plus_in)
min_in = inv_stdv * (centered_y - 1. / (num_classes - 1))
cdf_min = F.sigmoid(min_in)
# log probability for edge case of 0 (before scaling)
# equivalent: torch.log(F.sigmoid(plus_in))
# softplus: log(1+ e^{-x})
log_cdf_plus = plus_in - F.softplus(plus_in)
# log probability for edge case of 255 (before scaling)
# equivalent: (1 - F.sigmoid(min_in)).log()
log_one_minus_cdf_min = -F.softplus(min_in)
# probability for all other cases
cdf_delta = cdf_plus - cdf_min
mid_in = inv_stdv * centered_y
# log probability in the center of the bin, to be used in extreme cases
# (not actually used in our code)
log_pdf_mid = mid_in - log_scales - 2. * F.softplus(mid_in)
# TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
# for num_classes=65536 case? 1e-7? not sure..
inner_inner_cond = cdf_delta > 1e-5
inner_inner_cond = paddle.cast(
inner_inner_cond, dtype=paddle.get_default_dtype())
# inner_inner_out = inner_inner_cond * \
# paddle.log(paddle.clip(cdf_delta, min=1e-12)) + \
# (1. - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
inner_inner_out = inner_inner_cond * paddle.log(
paddle.clip(cdf_delta, min=1e-12)) + (1. - inner_inner_cond) * (
log_pdf_mid - np.log((num_classes - 1) / 2))
inner_cond = y > 0.999
inner_cond = paddle.cast(inner_cond, dtype=paddle.get_default_dtype())
inner_out = inner_cond * log_one_minus_cdf_min + (1. - inner_cond
) * inner_inner_out
cond = y < -0.999
cond = paddle.cast(cond, dtype=paddle.get_default_dtype())
log_probs = cond * log_cdf_plus + (1. - cond) * inner_out
log_probs = log_probs + F.log_softmax(logit_probs, -1)
if reduce:
return -paddle.mean(log_sum_exp(log_probs))
else:
return -log_sum_exp(log_probs).unsqueeze(-1)
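# In math form (a summary of the computation above, with K = num_classes):
# given mixture weights pi_k = softmax(logit_probs), means mu_k and scales
# s_k, the discretized likelihood of a target y in [-1, 1] is
#   P(y) = sum_k pi_k * [ sigmoid((y - mu_k + 1/(K-1)) / s_k)
#                         - sigmoid((y - mu_k - 1/(K-1)) / s_k) ]
# with the edge bins (y < -0.999 and y > 0.999) replaced by open-ended CDF
# tails; the returned loss is -log P(y).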
def sample_from_discretized_mix_logistic(y, log_scale_min=None):
"""
Sample from discretized mixture of logistic distributions
Parameters
----------
y : Tensor
(B, C, T)
log_scale_min : float
Log scale minimum value
Returns
----------
Tensor
sample in range of [-1, 1].
"""
if log_scale_min is None:
log_scale_min = float(np.log(1e-14))
assert y.shape[1] % 3 == 0
nr_mix = y.shape[1] // 3
# (B, T, C)
y = y.transpose([0, 2, 1])
logit_probs = y[:, :, :nr_mix]
# sample mixture indicator from softmax
temp = paddle.uniform(
logit_probs.shape, dtype=logit_probs.dtype, min=1e-5, max=1.0 - 1e-5)
temp = logit_probs - paddle.log(-paddle.log(temp))
argmax = paddle.argmax(temp, axis=-1)
# (B, T) -> (B, T, nr_mix)
one_hot = F.one_hot(argmax, nr_mix)
one_hot = paddle.cast(one_hot, dtype=paddle.get_default_dtype())
# select logistic parameters
means = paddle.sum(y[:, :, nr_mix:2 * nr_mix] * one_hot, axis=-1)
log_scales = paddle.clip(
paddle.sum(y[:, :, 2 * nr_mix:3 * nr_mix] * one_hot, axis=-1),
min=log_scale_min)
# sample from logistic & clip to interval
# we don't actually round to the nearest 8bit value when sampling
u = paddle.uniform(means.shape, min=1e-5, max=1.0 - 1e-5)
x = means + paddle.exp(log_scales) * (paddle.log(u) - paddle.log(1. - u))
x = paddle.clip(x, min=-1., max=1.)
return x
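# Two standard tricks are used above: temp = logit_probs - log(-log(u)) is
# the Gumbel-max trick, so the argmax selects mixture component k with
# probability softmax(logit_probs)_k; and means + exp(log_scales) *
# (log(u) - log(1 - u)) is the inverse CDF of the logistic distribution,
# so x is a logistic sample with the selected mean and scale.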
# Loss for new Tacotron2
class GuidedAttentionLoss(nn.Layer):
"""Guided attention loss function module.
......