Commit 04d7f8b5 authored by lifuchen, committed by chenfeiyu

transform parse to argparse

Parent f5ac04b1
import jsonargparse

def add_config_options_to_parser(parser):
    parser.add_argument('--audio.num_mels', type=int, default=80,
                        help="the number of mel bands when calculating mel spectrograms.")
    parser.add_argument('--audio.n_fft', type=int, default=2048,
                        help="the number of fft components.")
    parser.add_argument('--audio.sr', type=int, default=22050,
                        help="the sampling rate of audio data file.")
    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
                        help="the preemphasis coefficient.")
    parser.add_argument('--audio.hop_length', type=int, default=128,
                        help="the number of samples to advance between frames.")
    parser.add_argument('--audio.win_length', type=int, default=1024,
                        help="the length (width) of the window function.")
    parser.add_argument('--audio.power', type=float, default=1.4,
                        help="the power to raise before griffin-lim.")
    parser.add_argument('--audio.min_level_db', type=int, default=-100,
                        help="the minimum level db.")
    parser.add_argument('--audio.ref_level_db', type=int, default=20,
                        help="the reference level db.")
    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
                        help="the outputs per step.")
    parser.add_argument('--encoder_n_layer', type=int, default=6,
                        help="the number of FFT Block in encoder.")
    parser.add_argument('--encoder_head', type=int, default=2,
                        help="the attention head number in encoder.")
    parser.add_argument('--encoder_conv1d_filter_size', type=int, default=1024,
                        help="the filter size of conv1d in encoder.")
    parser.add_argument('--max_sep_len', type=int, default=2048,
                        help="the max length of sequence.")
    parser.add_argument('--decoder_n_layer', type=int, default=6,
                        help="the number of FFT Block in decoder.")
    parser.add_argument('--decoder_head', type=int, default=2,
                        help="the attention head number in decoder.")
    parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
                        help="the filter size of conv1d in decoder.")
    parser.add_argument('--fs_hidden_size', type=int, default=256,
                        help="the hidden size in model of fastspeech.")
    parser.add_argument('--duration_predictor_output_size', type=int, default=256,
                        help="the output size of duration predictor.")
    parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
                        help="the filter size of conv1d in duration prediction.")
    parser.add_argument('--fft_conv1d_filter', type=int, default=3,
                        help="the filter size of conv1d in fft.")
    parser.add_argument('--fft_conv1d_padding', type=int, default=1,
                        help="the padding size of conv1d in fft.")
    parser.add_argument('--dropout', type=float, default=0.1,
                        help="the dropout in network.")
    parser.add_argument('--transformer_head', type=int, default=4,
                        help="the attention head num of transformerTTS.")
    parser.add_argument('--alpha', type=float, default=1.0,
                        help="the hyperparameter to determine the length of the expanded sequence mel, thereby controlling the voice speed.")
    parser.add_argument('--hidden_size', type=int, default=256,
                        help="the hidden size in model of transformerTTS.")
    parser.add_argument('--embedding_size', type=int, default=256,
                        help="the dim size of embedding of transformerTTS.")
    parser.add_argument('--warm_up_step', type=int, default=4000,
                        help="the warm up step of learning rate.")
    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
                        help="the threshold of grad clip.")
    parser.add_argument('--batch_size', type=int, default=32,
                        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
                        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
                        help="checkpointing interval during training.")
    parser.add_argument('--fastspeech_step', type=int, default=160000,
                        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument('--use_gpu', type=bool, default=True,
                        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=bool, default=False,
                        help="use data parallel or not during training.")
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
                        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
                        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
                        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
                        help="the directory to save audio sample in synthesis.")
    parser.add_argument('--transtts_path', type=str, default='./log',
                        help="the directory to load pretrain transformerTTS model.")
    parser.add_argument('--transformer_step', type=int, default=70000,
                        help="the step to load transformerTTS model.")
    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275
  win_length: 1102
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1

hidden_size: 256
embedding_size: 512
warm_up_step: 4000
grad_clip_thresh: 1.0
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 1000
image_step: 2000
use_gpu: True
use_data_parallel: False
stop_token: False
data_path: ../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
#checkpoint_path: ./checkpoint
#transformer_step: 97000
\ No newline at end of file
@@ -10,37 +10,23 @@ audio:
   ref_level_db: 20 #the reference level db.
   outputs_per_step: 1 #the outputs per step.
-encoder_n_layer: 6
-encoder_head: 2
-encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
-decoder_n_layer: 6
-decoder_head: 2
-decoder_conv1d_filter_size: 1536
-fs_hidden_size: 384
-duration_predictor_output_size: 256
-duration_predictor_filter_size: 3
-fft_conv1d_filter: 3
-fft_conv1d_padding: 1
-dropout: 0.1
-transformer_head: 4
-embedding_size: 512
-hidden_size: 256
-warm_up_step: 4000
-grad_clip_thresh: 0.1
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 500
-use_gpu: True
-use_data_parallel: True
-data_path: ../../dataset/LJSpeech-1.1
-transtts_path: ../TransformerTTS/checkpoint/
-transformer_step: 160000
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 97000
+encoder_n_layer: 6 #the number of FFT Block in encoder.
+encoder_head: 2 #the attention head number in encoder.
+encoder_conv1d_filter_size: 1536 #the filter size of conv1d in encoder.
+max_seq_len: 2048 #the max length of sequence.
+decoder_n_layer: 6 #the number of FFT Block in decoder.
+decoder_head: 2 #the attention head number in decoder.
+decoder_conv1d_filter_size: 1536 #the filter size of conv1d in decoder.
+fs_hidden_size: 384 #the hidden size in model of fastspeech.
+duration_predictor_output_size: 256 #the output size of duration predictor.
+duration_predictor_filter_size: 3 #the filter size of conv1d in duration prediction.
+fft_conv1d_filter: 3 #the filter size of conv1d in fft.
+fft_conv1d_padding: 1 #the padding size of conv1d in fft.
+dropout: 0.1 #the dropout in network.
+transformer_head: 4 #the attention head num of transformerTTS.
+embedding_size: 512 #the dim size of embedding of transformerTTS.
+hidden_size: 256 #the hidden size in model of transformerTTS.
+warm_up_step: 4000 #the warm up step of learning rate.
+grad_clip_thresh: 0.1 #the threshold of grad clip.
@@ -13,7 +13,7 @@ audio:
 encoder_n_layer: 6
 encoder_head: 2
 encoder_conv1d_filter_size: 1536
-max_sep_len: 2048
+max_seq_len: 2048
 decoder_n_layer: 6
 decoder_head: 2
 decoder_conv1d_filter_size: 1536
@@ -23,11 +23,4 @@ duration_predictor_filter_size: 3
 fft_conv1d_filter: 3
 fft_conv1d_padding: 1
 dropout: 0.1
 transformer_head: 4
-use_gpu: True
-alpha: 1.0
-checkpoint_path: checkpoint/
-fastspeech_step: 71000
-log_dir: ./log
\ No newline at end of file
import argparse

def add_config_options_to_parser(parser):
    parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
                        help="the yaml config file path.")
    parser.add_argument('--batch_size', type=int, default=32,
                        help="batch size for training.")
    parser.add_argument('--epochs', type=int, default=10000,
                        help="the number of epoch for training.")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="the learning rate for training.")
    parser.add_argument('--save_step', type=int, default=500,
                        help="checkpointing interval during training.")
    parser.add_argument('--fastspeech_step', type=int, default=70000,
                        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument('--use_gpu', type=int, default=1,
                        help="use gpu or not during training.")
    parser.add_argument('--use_data_parallel', type=int, default=0,
                        help="use data parallel or not during training.")
    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
                        help="the path of dataset.")
    parser.add_argument('--checkpoint_path', type=str, default=None,
                        help="the path to load checkpoint or pretrain model.")
    parser.add_argument('--save_path', type=str, default='./checkpoint',
                        help="the path to save checkpoint.")
    parser.add_argument('--log_dir', type=str, default='./log',
                        help="the directory to save tensorboard log.")
    parser.add_argument('--sample_path', type=str, default='./sample',
                        help="the directory to save audio sample in synthesis.")
    parser.add_argument('--transtts_path', type=str, default='./log',
                        help="the directory to load pretrain transformerTTS model.")
    parser.add_argument('--transformer_step', type=int, default=160000,
                        help="the step to load transformerTTS model.")
 import os
 from tensorboardX import SummaryWriter
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from parakeet.g2p.en import text_to_sequence
 from parakeet import audio
-from network import FastSpeech
+from parakeet.models.fastspeech.fastspeech import FastSpeech

 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -21,19 +22,22 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict

-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'synthesis')
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)

     writer = SummaryWriter(path)

     with dg.guard(place):
         model = FastSpeech(cfg)
-        model.set_dict(load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech")))
+        model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
         model.eval()

         text = np.asarray(text_to_sequence(text_input))
@@ -41,18 +45,18 @@ def synthesis(text_input, cfg):
         pos_text = np.arange(1, text.shape[1]+1)
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])

-        mel_output, mel_output_postnet = model(text, pos_text, alpha=cfg.alpha)
+        mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length=cfg.audio.win_length,
-            hop_length=cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length=cfg['audio']['win_length'],
+            hop_length=cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -65,12 +69,12 @@ def synthesis(text_input, cfg):
         mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
         wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
         print("Synthesis completed !!!")
         writer.close()

 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
\ No newline at end of file
@@ -3,10 +3,10 @@ import argparse
 import os
 import time
 import math
-import jsonargparse
 from pathlib import Path
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from tqdm import tqdm
 from collections import OrderedDict
 from tensorboardX import SummaryWriter
@@ -14,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 import paddle.fluid as fluid
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS
 from parakeet.models.fastspeech.fastspeech import FastSpeech
 from parakeet.models.fastspeech.utils import get_alignment
@@ -28,50 +28,49 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict, opti_dict

-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)

     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())

-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'fastspeech')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'fastspeech')

     writer = SummaryWriter(path) if local_rank == 0 else None

     with dg.guard(place):
         with fluid.unique_name.guard():
             transformerTTS = TransformerTTS(cfg)
-            model_dict, _ = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.transtts_path, "transformer"))
+            model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
             transformerTTS.set_dict(model_dict)
             transformerTTS.eval()

         model = FastSpeech(cfg)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())
-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()

-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.fastspeech_step), os.path.join(cfg.checkpoint_path, "fastspeech"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.fastspeech_step
+            global_step = args.fastspeech_step
             print("load checkpoint!!!")

-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
             strategy = dg.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)

-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
@@ -79,7 +78,7 @@ def main(cfg):
                 character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data

                 _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
-                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg.transformer_head)).astype(np.float32)
+                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)

                 global_step += 1
@@ -101,20 +100,20 @@ def main(cfg):
                     writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                     total_loss = model.scale_loss(total_loss)
                     total_loss.backward()
                     model.apply_collective_grads()
                 else:
                     total_loss.backward()
-                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                 model.clear_gradients()

                 # save checkpoint
-                if local_rank==0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path,'fastspeech/%d' % global_step)
+                if local_rank==0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
                     dg.save_dygraph(model.state_dict(), save_path)
                     dg.save_dygraph(optimizer.state_dict(), save_path)
         if local_rank==0:
@@ -122,7 +121,9 @@ def main(cfg):
 if __name__ =='__main__':
-    parser = jsonargparse.ArgumentParser(description="Train Fastspeech model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train Fastspeech model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c config/fastspeech.yaml'.split())
-    main(cfg)
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
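A note on the NoamDecay call above: assuming Paddle's `dg.NoamDecay(d_model, warmup_steps)` implements the standard Transformer ("Noam") schedule, the `1/(warm_up_step * lr**2)` argument is chosen so that the schedule peaks at exactly the `--lr` value at the end of warm-up:

```latex
\eta(t) = d^{-1/2}\,\min\!\left(t^{-1/2},\; t\,w^{-3/2}\right),
\qquad
d = \frac{1}{w\,\eta_0^{2}}
\;\Longrightarrow\;
\eta(w) = \eta_0\sqrt{w}\cdot w^{-1/2} = \eta_0,
```

where t is the global step, w is `warm_up_step`, and η₀ is `args.lr`.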
audio:
  num_mels: 80
  n_fft: 2048
  sr: 22050
  preemphasis: 0.97
  hop_length: 275
  win_length: 1102
  power: 1.2
  min_level_db: -100
  ref_level_db: 20
  outputs_per_step: 1
\ No newline at end of file
@@ -10,11 +10,11 @@ audio:
   ref_level_db: 20
   outputs_per_step: 1

-max_len: 50
-transformer_step: 10
-postnet_step: 10
-use_gpu: True
-checkpoint_path: ./checkpoint
-log_dir: ./log
-sample_path: ./sample
+hidden_size: 256
+embedding_size: 512
+warm_up_step: 4000
+grad_clip_thresh: 1.0
\ No newline at end of file
@@ -12,18 +12,5 @@ audio:
 hidden_size: 256
 embedding_size: 512
 warm_up_step: 4000
 grad_clip_thresh: 1.0
-batch_size: 32
-epochs: 10000
-lr: 0.001
-save_step: 10
-use_gpu: True
-use_data_parallel: True
-data_path: ../../dataset/LJSpeech-1.1
-save_path: ./checkpoint
-log_dir: ./log
-#checkpoint_path: ./checkpoint
-#transformer_step: 27000
\ No newline at end of file
-import jsonargparse
+import argparse

 def add_config_options_to_parser(parser):
-    parser.add_argument('--audio.num_mels', type=int, default=80,
-                        help="the number of mel bands when calculating mel spectrograms.")
-    parser.add_argument('--audio.n_fft', type=int, default=2048,
-                        help="the number of fft components.")
-    parser.add_argument('--audio.sr', type=int, default=22050,
-                        help="the sampling rate of audio data file.")
-    parser.add_argument('--audio.preemphasis', type=float, default=0.97,
-                        help="the preemphasis coefficient.")
-    parser.add_argument('--audio.hop_length', type=int, default=128,
-                        help="the number of samples to advance between frames.")
-    parser.add_argument('--audio.win_length', type=int, default=1024,
-                        help="the length (width) of the window function.")
-    parser.add_argument('--audio.power', type=float, default=1.4,
-                        help="the power to raise before griffin-lim.")
-    parser.add_argument('--audio.min_level_db', type=int, default=-100,
-                        help="the minimum level db.")
-    parser.add_argument('--audio.ref_level_db', type=int, default=20,
-                        help="the reference level db.")
-    parser.add_argument('--audio.outputs_per_step', type=int, default=1,
-                        help="the outputs per step.")
-    parser.add_argument('--hidden_size', type=int, default=256,
-                        help="the hidden size in network.")
-    parser.add_argument('--embedding_size', type=int, default=512,
-                        help="the embedding vector size.")
-    parser.add_argument('--warm_up_step', type=int, default=4000,
-                        help="the warm up step of learning rate.")
-    parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
-                        help="the threshold of grad clip.")
+    parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
+                        help="the yaml config file path.")
     parser.add_argument('--batch_size', type=int, default=32,
                         help="batch size for training.")
     parser.add_argument('--epochs', type=int, default=10000,
@@ -45,13 +17,13 @@ def add_config_options_to_parser(parser):
                         help="The max length of audio when synthesis.")
     parser.add_argument('--transformer_step', type=int, default=160000,
                         help="Global step to restore checkpoint of transformer.")
-    parser.add_argument('--postnet_step', type=int, default=90000,
+    parser.add_argument('--vocoder_step', type=int, default=90000,
                         help="Global step to restore checkpoint of postnet.")
-    parser.add_argument('--use_gpu', type=bool, default=True,
+    parser.add_argument('--use_gpu', type=int, default=1,
                         help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=bool, default=False,
+    parser.add_argument('--use_data_parallel', type=int, default=0,
                         help="use data parallel or not during training.")
-    parser.add_argument('--stop_token', type=bool, default=False,
+    parser.add_argument('--stop_token', type=int, default=0,
                         help="use stop token loss in network or not.")
     parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
@@ -62,8 +34,5 @@ def add_config_options_to_parser(parser):
                         help="the path to save checkpoint.")
     parser.add_argument('--log_dir', type=str, default='./log',
                         help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./log',
+    parser.add_argument('--sample_path', type=str, default='./sample',
                         help="the directory to save audio sample in synthesis.")
-    parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
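The bool-to-int changes above are not cosmetic. argparse applies `type` as a plain callable to the raw argument string, and `bool('False')` is `True` because every non-empty string is truthy, so a `type=bool` flag can never be switched off from the command line. A minimal demonstration of the pitfall:

```python
import argparse

p = argparse.ArgumentParser()
p.add_argument('--use_gpu', type=bool, default=True)
print(p.parse_args(['--use_gpu', 'False']).use_gpu)    # True: bool('False') is truthy

p = argparse.ArgumentParser()
p.add_argument('--use_gpu', type=int, default=1)
print(bool(p.parse_args(['--use_gpu', '0']).use_gpu))  # False, as intended
```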
@@ -2,17 +2,19 @@ import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
 import numpy as np
-from network import TransformerTTS, ModelPostNet
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
+from ruamel import yaml
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 from pathlib import Path
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
 from collections import OrderedDict
 from parakeet import audio
+from parakeet.models.transformer_tts.vocoder import Vocoder
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS

 def load_checkpoint(step, model_path):
     model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -24,25 +26,28 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict

-def synthesis(text_input, cfg):
-    place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
+def synthesis(text_input, args):
+    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
+
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)

     # tensorboard
-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'synthesis')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'synthesis')

     writer = SummaryWriter(path)

     with dg.guard(place):
         with fluid.unique_name.guard():
             model = TransformerTTS(cfg)
-            model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "nostop_token/transformer")))
+            model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "nostop_token/transformer")))
             model.eval()

         with fluid.unique_name.guard():
-            model_postnet = ModelPostNet(cfg)
-            model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
+            model_postnet = Vocoder(cfg, args.batch_size)
+            model_postnet.set_dict(load_checkpoint(str(args.postnet_step), os.path.join(args.checkpoint_path, "postnet")))
             model_postnet.eval()

         # init input
         text = np.asarray(text_to_sequence(text_input))
@@ -52,7 +57,7 @@ def synthesis(text_input, cfg):
         pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])

-        pbar = tqdm(range(cfg.max_len))
+        pbar = tqdm(range(args.max_len))
         for i in pbar:
             pos_mel = np.arange(1, mel_input.shape[1]+1)
@@ -62,15 +67,15 @@ def synthesis(text_input, cfg):
         mag_pred = model_postnet(postnet_pred)

         _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg.audio.sr,
-            num_mels=cfg.audio.num_mels,
-            min_level_db=cfg.audio.min_level_db,
-            ref_level_db=cfg.audio.ref_level_db,
-            n_fft=cfg.audio.n_fft,
-            win_length=cfg.audio.win_length,
-            hop_length=cfg.audio.hop_length,
-            power=cfg.audio.power,
-            preemphasis=cfg.audio.preemphasis,
+            sample_rate=cfg['audio']['sr'],
+            num_mels=cfg['audio']['num_mels'],
+            min_level_db=cfg['audio']['min_level_db'],
+            ref_level_db=cfg['audio']['ref_level_db'],
+            n_fft=cfg['audio']['n_fft'],
+            win_length=cfg['audio']['win_length'],
+            hop_length=cfg['audio']['hop_length'],
+            power=cfg['audio']['power'],
+            preemphasis=cfg['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
@@ -82,14 +87,14 @@ def synthesis(text_input, cfg):
             sound_norm=False)

         wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
-        writer.add_audio(text_input, wav, 0, cfg.audio.sr)
-        if not os.path.exists(cfg.sample_path):
-            os.mkdir(cfg.sample_path)
-        write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav)
+        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
+        if not os.path.exists(args.sample_path):
+            os.mkdir(args.sample_path)
+        write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
         writer.close()

 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Synthesis model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
-    synthesis("Transformer model is so fast!", cfg)
+    args = parser.parse_args()
+    synthesis("Transformer model is so fast!", args)
@@ -3,9 +3,10 @@ from tqdm import tqdm
 from tensorboardX import SummaryWriter
 from pathlib import Path
 from collections import OrderedDict
-import jsonargparse
+import argparse
 from parse import add_config_options_to_parser
 from pprint import pprint
+from ruamel import yaml
 from matplotlib import cm
 import numpy as np
 import paddle.fluid as fluid
@@ -13,7 +14,7 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 from parakeet.modules.utils import cross_entropy
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.transformerTTS import TransformerTTS
+from parakeet.models.transformer_tts.transformerTTS import TransformerTTS

 def load_checkpoint(step, model_path):
     model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
@@ -26,22 +27,21 @@ def load_checkpoint(step, model_path):
     return new_state_dict, opti_dict

-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)

     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())

-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'transformer')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'transformer')

     writer = SummaryWriter(path) if local_rank == 0 else None
@@ -49,23 +49,23 @@ def main(cfg):
         model = TransformerTTS(cfg)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())

-        reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()

-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.transformer_step
+            global_step = args.transformer_step
             print("load checkpoint!!!")

-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
             strategy = dg.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)

-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d'%epoch)
@@ -81,7 +81,7 @@ def main(cfg):
                 post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                 loss = mel_loss + post_mel_loss
                 # Note: When used stop token loss the learning did not work.
-                if cfg.stop_token:
+                if args.stop_token:
                     stop_loss = cross_entropy(stop_preds, label)
                     loss = loss + stop_loss
@@ -91,7 +91,7 @@ def main(cfg):
                         'post_mel_loss':post_mel_loss.numpy()
                     }, global_step)
-                    if cfg.stop_token:
+                    if args.stop_token:
                         writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
                     writer.add_scalars('alphas', {
@@ -101,7 +101,7 @@ def main(cfg):
                     writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)

-                    if global_step % cfg.image_step == 1:
+                    if global_step % args.image_step == 1:
                         for i, prob in enumerate(attn_probs):
                             for j in range(4):
                                 x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
@@ -117,20 +117,20 @@ def main(cfg):
                                 x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
                                 writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")

-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                     loss = model.scale_loss(loss)
                     loss.backward()
                     model.apply_collective_grads()
                 else:
                     loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                 model.clear_gradients()

                 # save checkpoint
-                if local_rank==0 and global_step % cfg.save_step == 0:
-                    if not os.path.exists(cfg.save_path):
-                        os.mkdir(cfg.save_path)
-                    save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
+                if local_rank==0 and global_step % args.save_step == 0:
+                    if not os.path.exists(args.save_path):
+                        os.mkdir(args.save_path)
+                    save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
                     dg.save_dygraph(model.state_dict(), save_path)
                     dg.save_dygraph(optimizer.state_dict(), save_path)
         if local_rank==0:
@@ -138,7 +138,10 @@ def main(cfg):
 if __name__ =='__main__':
-    parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
-    main(cfg)
\ No newline at end of file
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
\ No newline at end of file
@@ -3,14 +3,15 @@ import os
 from tqdm import tqdm
 from pathlib import Path
 from collections import OrderedDict
-import jsonargparse
+import argparse
+from ruamel import yaml
 from parse import add_config_options_to_parser
 from pprint import pprint
 import paddle.fluid as fluid
 import paddle.fluid.dygraph as dg
 import paddle.fluid.layers as layers
 from parakeet.models.dataloader.ljspeech import LJSpeechLoader
-from parakeet.models.transformerTTS.vocoder import Vocoder
+from parakeet.models.transformer_tts.vocoder import Vocoder

 def load_checkpoint(step, model_path):
     model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
@@ -22,48 +23,47 @@ def load_checkpoint(step, model_path):
         new_state_dict[param] = model_dict[param]
     return new_state_dict, opti_dict

-def main(cfg):
-    local_rank = dg.parallel.Env().local_rank if cfg.use_data_parallel else 0
-    nranks = dg.parallel.Env().nranks if cfg.use_data_parallel else 1
+def main(args):
+    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
+    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

-    if local_rank == 0:
-        # Print the whole config setting.
-        pprint(jsonargparse.namespace_to_dict(cfg))
+    with open(args.config_path) as f:
+        cfg = yaml.load(f, Loader=yaml.Loader)

     global_step = 0
     place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
-             if cfg.use_data_parallel else fluid.CUDAPlace(0)
-             if cfg.use_gpu else fluid.CPUPlace())
+             if args.use_data_parallel else fluid.CUDAPlace(0)
+             if args.use_gpu else fluid.CPUPlace())

-    if not os.path.exists(cfg.log_dir):
-        os.mkdir(cfg.log_dir)
-    path = os.path.join(cfg.log_dir,'postnet')
+    if not os.path.exists(args.log_dir):
+        os.mkdir(args.log_dir)
+    path = os.path.join(args.log_dir,'postnet')

     writer = SummaryWriter(path) if local_rank == 0 else None

     with dg.guard(place):
-        model = Vocoder(cfg)
+        model = Vocoder(cfg, args.batch_size)
         model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
+        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
                                                   parameter_list=model.parameters())

-        if cfg.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet"))
+        if args.checkpoint_path is not None:
+            model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "postnet"))
             model.set_dict(model_dict)
             optimizer.set_dict(opti_dict)
-            global_step = cfg.postnet_step
+            global_step = args.vocoder_step
             print("load checkpoint!!!")

-        if cfg.use_data_parallel:
+        if args.use_data_parallel:
             strategy = dg.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)

-        reader = LJSpeechLoader(cfg, nranks, local_rank, is_vocoder=True).reader()
+        reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()

-        for epoch in range(cfg.epochs):
+        for epoch in range(args.epochs):
             pbar = tqdm(reader)
             for i, data in enumerate(pbar):
                 pbar.set_description('Processing at epoch %d'%epoch)
@@ -75,13 +75,13 @@ def main(cfg):
                 mag_pred = model(mel)
                 loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))

-                if cfg.use_data_parallel:
+                if args.use_data_parallel:
                     loss = model.scale_loss(loss)
                     loss.backward()
                     model.apply_collective_grads()
                 else:
                     loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
+                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
                 model.clear_gradients()

                 if local_rank==0:
@@ -89,10 +89,10 @@ def main(cfg):
                         'loss':loss.numpy(),
                     }, global_step)

-                    if global_step % cfg.save_step == 0:
-                        if not os.path.exists(cfg.save_path):
-                            os.mkdir(cfg.save_path)
-                        save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
+                    if global_step % args.save_step == 0:
+                        if not os.path.exists(args.save_path):
+                            os.mkdir(args.save_path)
+                        save_path = os.path.join(args.save_path,'postnet/%d' % global_step)
                         dg.save_dygraph(model.state_dict(), save_path)
                         dg.save_dygraph(optimizer.state_dict(), save_path)
@@ -100,7 +100,9 @@ def main(cfg):
     writer.close()

 if __name__ == '__main__':
-    parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
+    parser = argparse.ArgumentParser(description="Train postnet model")
     add_config_options_to_parser(parser)
-    cfg = parser.parse_args('-c ./config/train_vocoder.yaml'.split())
-    main(cfg)
\ No newline at end of file
+    args = parser.parse_args()
+    # Print the whole config setting.
+    pprint(args)
+    main(args)
\ No newline at end of file
@@ -13,17 +13,17 @@ from parakeet.data.batch import TextIDBatcher, SpecBatcher
 from parakeet.data.dataset import DatasetMixin, TransformDataset

 class LJSpeechLoader:
-    def __init__(self, config, nranks, rank, is_vocoder=False, shuffle=True):
-        place = fluid.CUDAPlace(rank) if config.use_gpu else fluid.CPUPlace()
+    def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
+        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()

-        LJSPEECH_ROOT = Path(config.data_path)
+        LJSPEECH_ROOT = Path(args.data_path)
         metadata = LJSpeechMetaData(LJSPEECH_ROOT)
         transformer = LJSpeech(config)
         dataset = TransformDataset(metadata, transformer)
         sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)

-        assert config.batch_size % nranks == 0
-        each_bs = config.batch_size // nranks
+        assert args.batch_size % nranks == 0
+        each_bs = args.batch_size // nranks
         if is_vocoder:
             dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
         else:
@@ -63,15 +63,15 @@ class LJSpeech(object):
         super(LJSpeech, self).__init__()
         self.config = config
         self._ljspeech_processor = audio.AudioProcessor(
-            sample_rate=config.audio.sr,
-            num_mels=config.audio.num_mels,
-            min_level_db=config.audio.min_level_db,
-            ref_level_db=config.audio.ref_level_db,
-            n_fft=config.audio.n_fft,
-            win_length=config.audio.win_length,
-            hop_length=config.audio.hop_length,
-            power=config.audio.power,
-            preemphasis=config.audio.preemphasis,
+            sample_rate=config['audio']['sr'],
+            num_mels=config['audio']['num_mels'],
+            min_level_db=config['audio']['min_level_db'],
+            ref_level_db=config['audio']['ref_level_db'],
+            n_fft=config['audio']['n_fft'],
+            win_length=config['audio']['win_length'],
+            hop_length=config['audio']['hop_length'],
+            power=config['audio']['power'],
+            preemphasis=config['audio']['preemphasis'],
             signal_norm=True,
             symmetric_norm=False,
             max_norm=1.,
...
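For orientation, the call sites in the training scripts above change accordingly; a minimal sketch, assuming `cfg` is the YAML dict and `args` the argparse namespace loaded as in train.py:

```python
# Acoustic-model loader (mel targets):
reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()

# Vocoder loader (magnitude targets, different batching function):
vocoder_reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
```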
@@ -2,7 +2,7 @@ import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.g2p.text.symbols import symbols
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet
 from parakeet.models.fastspeech.LengthRegulator import LengthRegulator
 from parakeet.models.fastspeech.encoder import Encoder
 from parakeet.models.fastspeech.decoder import Decoder
@@ -13,43 +13,43 @@ class FastSpeech(dg.Layer):
         super(FastSpeech, self).__init__()

         self.encoder = Encoder(n_src_vocab=len(symbols)+1,
-                               len_max_seq=cfg.max_sep_len,
-                               n_layers=cfg.encoder_n_layer,
-                               n_head=cfg.encoder_head,
-                               d_k=cfg.fs_hidden_size // cfg.encoder_head,
-                               d_v=cfg.fs_hidden_size // cfg.encoder_head,
-                               d_model=cfg.fs_hidden_size,
-                               d_inner=cfg.encoder_conv1d_filter_size,
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+                               len_max_seq=cfg['max_seq_len'],
+                               n_layers=cfg['encoder_n_layer'],
+                               n_head=cfg['encoder_head'],
+                               d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
+                               d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
+                               d_model=cfg['fs_hidden_size'],
+                               d_inner=cfg['encoder_conv1d_filter_size'],
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
-        self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
-                                                out_channels=cfg.duration_predictor_output_size,
-                                                filter_size=cfg.duration_predictor_filter_size,
-                                                dropout=cfg.dropout)
-        self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
-                               n_layers=cfg.decoder_n_layer,
-                               n_head=cfg.decoder_head,
-                               d_k=cfg.fs_hidden_size // cfg.decoder_head,
-                               d_v=cfg.fs_hidden_size // cfg.decoder_head,
-                               d_model=cfg.fs_hidden_size,
-                               d_inner=cfg.decoder_conv1d_filter_size,
-                               fft_conv1d_kernel=cfg.fft_conv1d_filter,
-                               fft_conv1d_padding=cfg.fft_conv1d_padding,
+        self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'],
+                                                out_channels=cfg['duration_predictor_output_size'],
+                                                filter_size=cfg['duration_predictor_filter_size'],
+                                                dropout=cfg['dropout'])
+        self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
+                               n_layers=cfg['decoder_n_layer'],
+                               n_head=cfg['decoder_head'],
+                               d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
+                               d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
+                               d_model=cfg['fs_hidden_size'],
+                               d_inner=cfg['decoder_conv1d_filter_size'],
+                               fft_conv1d_kernel=cfg['fft_conv1d_filter'],
+                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
                                dropout=0.1)
         self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
-        k = math.sqrt(1 / cfg.fs_hidden_size)
+        k = math.sqrt(1 / cfg['fs_hidden_size'])
         self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
-        self.mel_linear = dg.Linear(cfg.fs_hidden_size,
-                                    cfg.audio.num_mels * cfg.audio.outputs_per_step,
+        self.mel_linear = dg.Linear(cfg['fs_hidden_size'],
+                                    cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
                                     param_attr = self.weight,
                                     bias_attr = self.bias,)
-        self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
+        self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
                                    num_hidden=512,
                                    filter_size=5,
                                    padding=int(5 / 2),
                                    num_conv=5,
-                                   outputs_per_step=cfg.audio.outputs_per_step,
+                                   outputs_per_step=cfg['audio']['outputs_per_step'],
                                    use_cudnn=True,
                                    dropout=0.1,
                                    batchnorm_last=True)
...
@@ -4,8 +4,8 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.prenet import PreNet
-from parakeet.models.transformerTTS.post_convnet import PostConvNet
+from parakeet.models.transformer_tts.prenet import PreNet
+from parakeet.models.transformer_tts.post_convnet import PostConvNet

 class Decoder(dg.Layer):
     def __init__(self, num_hidden, config, num_head=4):
@@ -20,7 +20,7 @@ class Decoder(dg.Layer):
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
                 trainable=False))
-        self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
+        self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'],
                                      hidden_size = num_hidden * 2,
                                      output_size = num_hidden,
                                      dropout_rate=0.2)
@@ -38,17 +38,17 @@ class Decoder(dg.Layer):
         self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step,
+        self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
                                     param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
                                     bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
         self.stop_linear = dg.Linear(num_hidden, 1,
                                      param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
                                      bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
-        self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
+        self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'],
                                        filter_size = 5, padding = 4, num_conv=5,
-                                       outputs_per_step=config.audio.outputs_per_step,
-                                       use_cudnn = config.use_gpu)
+                                       outputs_per_step=config['audio']['outputs_per_step'],
+                                       use_cudnn = True)

     def forward(self, key, value, query, c_mask, positional):
...
@@ -3,10 +3,10 @@ import paddle.fluid as fluid
 from parakeet.modules.utils import *
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
-from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet
+from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet

 class Encoder(dg.Layer):
-    def __init__(self, embedding_size, num_hidden, config, num_head=4):
+    def __init__(self, embedding_size, num_hidden, num_head=4):
         super(Encoder, self).__init__()
         self.num_hidden = num_hidden
         param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
@@ -19,11 +19,11 @@ class Encoder(dg.Layer):
                 trainable=False))
         self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
                                             num_hidden = num_hidden,
-                                            use_cudnn=config.use_gpu)
+                                            use_cudnn=True)
         self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
         for i, layer in enumerate(self.layers):
             self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
+        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
         for i, layer in enumerate(self.ffns):
             self.add_sublayer("ffns_{}".format(i), layer)
...
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
-from parakeet.models.transformerTTS.encoder import Encoder
-from parakeet.models.transformerTTS.decoder import Decoder
+from parakeet.models.transformer_tts.encoder import Encoder
+from parakeet.models.transformer_tts.decoder import Decoder

 class TransformerTTS(dg.Layer):
     def __init__(self, config):
         super(TransformerTTS, self).__init__()
-        self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
-        self.decoder = Decoder(config.hidden_size, config)
+        self.encoder = Encoder(config['embedding_size'], config['hidden_size'])
+        self.decoder = Decoder(config['hidden_size'], config)
         self.config = config

     def forward(self, characters, mel_input, pos_text, pos_mel):
...
@@ -2,20 +2,20 @@ import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.modules.customized import Conv1D
 from parakeet.modules.utils import *
-from parakeet.models.transformerTTS.CBHG import CBHG
+from parakeet.models.transformer_tts.CBHG import CBHG

 class Vocoder(dg.Layer):
     """
     CBHG Network (mel -> linear)
     """
-    def __init__(self, config):
+    def __init__(self, config, batch_size):
         super(Vocoder, self).__init__()
-        self.pre_proj = Conv1D(num_channels = config.audio.num_mels,
-                               num_filters = config.hidden_size,
+        self.pre_proj = Conv1D(num_channels = config['audio']['num_mels'],
+                               num_filters = config['hidden_size'],
                                filter_size=1)
-        self.cbhg = CBHG(config.hidden_size, config.batch_size)
+        self.cbhg = CBHG(config['hidden_size'], batch_size)
-        self.post_proj = Conv1D(num_channels = config.hidden_size,
-                               num_filters = (config.audio.n_fft // 2) + 1,
+        self.post_proj = Conv1D(num_channels = config['hidden_size'],
+                               num_filters = (config['audio']['n_fft'] // 2) + 1,
                                filter_size=1)

     def forward(self, mel):
...
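With batch_size moved out of the config dict and into the constructor, the YAML files stay free of runtime-dependent values. A hypothetical instantiation, mirroring the synthesis script above (cfg and args as loaded there):

```python
model_postnet = Vocoder(cfg, args.batch_size)  # batch size now passed explicitly
mag_pred = model_postnet(mel)                  # mel spectrogram -> linear magnitude
```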