Commit 8a9bbc26 authored by lifuchen, committed by chenfeiyu

add_TransformerTTS

Parent fd9e198a
......@@ -88,7 +88,7 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = []
for example in minibatch:
......
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
max_len: 50
transformer_step: 1
postnet_step: 1
use_gpu: True
checkpoint_path: ./checkpoint
log_dir: ./log
sample_path: ./sample
\ No newline at end of file
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
network:
hidden_size: 256
embedding_size: 512
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
\ No newline at end of file
audio:
num_mels: 80
n_fft: 2048
sr: 22050
preemphasis: 0.97
hop_length: 275
win_length: 1102
power: 1.2
min_level_db: -100
ref_level_db: 20
outputs_per_step: 1
network:
hidden_size: 256
embedding_size: 512
batch_size: 32
epochs: 10000
lr: 0.001
save_step: 500
image_step: 2000
use_gpu: True
use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1
save_path: ./checkpoint
log_dir: ./log
\ No newline at end of file
import math
import numpy as np
import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg
class Conv1D(dg.Layer):
"""
A 1D convolution block implemented with Conv2D. For simplicity, and to
ensure the output has the same length as the input, it does not allow
stride > 1.
"""
def __init__(self,
name_scope,
in_channels,
num_filters,
filter_size=3,
padding=0,
dilation=1,
stride=1,
groups=None,
param_attr=None,
bias_attr=None,
use_cudnn=True,
act=None,
data_format='NCT',
dtype="float32"):
super(Conv1D, self).__init__(name_scope, dtype=dtype)
self.padding = padding
self.in_channels = in_channels
self.num_filters = num_filters
self.filter_size = filter_size
self.stride = stride
self.dilation = dilation
self.padding = padding
self.act = act
self.data_format = data_format
self.conv = dg.Conv2D(
self.full_name(),
num_filters=num_filters,
filter_size=(1, filter_size),
stride=(1, stride),
dilation=(1, dilation),
padding=(0, padding),
groups=groups,
param_attr=param_attr,
bias_attr=bias_attr,
use_cudnn=use_cudnn,
act=act,
dtype=dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.conv(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
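# A minimal usage sketch (names and shapes here are illustrative, not from
# this repo):
#     conv = Conv1D("conv", in_channels=80, num_filters=256, filter_size=1)
#     y = conv(x)  # x: (B, 80, T) in 'NCT' -> y: (B, 256, T); a size-1
#                  # filter with no padding leaves the length unchanged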
class Pool1D(dg.Layer):
"""
A Pool 1D block implemented with Pool2D.
"""
def __init__(self,
name_scope,
pool_size=-1,
pool_type='max',
pool_stride=1,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
exclusive=True,
data_format='NCT',
dtype='float32'):
super(Pool1D, self).__init__(name_scope, dtype=dtype)
self.pool_size = pool_size
self.pool_type = pool_type
self.pool_stride = pool_stride
self.pool_padding = pool_padding
self.global_pooling = global_pooling
self.use_cudnn = use_cudnn
self.ceil_mode = ceil_mode
self.exclusive = exclusive
self.data_format = data_format
self.dtype = dtype
self.pool2d = dg.Pool2D(self.full_name(), [1, pool_size], pool_type=pool_type,
                        pool_stride=[1, pool_stride], pool_padding=[0, pool_padding],
                        global_pooling=global_pooling, use_cudnn=use_cudnn,
                        ceil_mode=ceil_mode, exclusive=exclusive, dtype=dtype)
def forward(self, x):
"""
Args:
x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
input channels.
Returns:
x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
output channels (num_filters).
"""
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
x = fluid.layers.unsqueeze(x, [2])
x = self.pool2d(x)
x = fluid.layers.squeeze(x, [2])
if self.data_format == 'NTC':
x = fluid.layers.transpose(x, [0, 2, 1])
return x
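# A minimal usage sketch (illustrative settings):
#     pool = Pool1D("pool", pool_size=2, pool_stride=2, data_format='NCT')
#     y = pool(x)  # (B, C, T) -> (B, C, T // 2); channels are unchanged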
class DynamicGRU(dg.Layer):
def __init__(self,
scope_name,
size,
param_attr=None,
bias_attr=None,
is_reverse=False,
gate_activation='sigmoid',
candidate_activation='tanh',
h_0=None,
origin_mode=False,
init_size=None):
super(DynamicGRU, self).__init__(scope_name)
self.gru_unit = dg.GRUUnit(
self.full_name(),
size * 3,
param_attr=param_attr,
bias_attr=bias_attr,
activation=candidate_activation,
gate_activation=gate_activation,
origin_mode=origin_mode)
self.size = size
self.h_0 = h_0
self.is_reverse = is_reverse
def forward(self, inputs):
hidden = self.h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
res = fluid.layers.concat(res, axis=1)
return res
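if __name__ == "__main__":
    # Smoke test for DynamicGRU; the sizes below are illustrative assumptions.
    # GRUUnit is created with size * 3 gate units, so the input feature
    # dimension must be 3 times the hidden size, and an initial state h_0
    # of shape (batch, hidden) is required.
    with dg.guard():
        batch, steps, hidden = 4, 10, 16
        h_0 = dg.to_variable(np.zeros((batch, hidden), dtype="float32"))
        gru = DynamicGRU("gru", size=hidden, h_0=h_0)
        x = dg.to_variable(
            np.random.randn(batch, steps, hidden * 3).astype("float32"))
        print(gru(x).shape)  # expected: [4, 10, 16]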
This diff is collapsed.
from module import *
from utils import get_positional_table, get_sinusoid_encoding_table
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class Encoder(dg.Layer):
def __init__(self, name_scope, embedding_size, num_hidden, config):
super(Encoder, self).__init__(name_scope)
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha')
self.alpha = self.create_parameter(param, shape=(1, ), dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(name_scope=self.full_name(),
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(name_scope = self.full_name(),
embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=config.use_gpu)
self.layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [FFN(self.full_name(), num_hidden, use_cudnn = config.use_gpu) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional):
if fluid.framework._dygraph_tracer()._train_mode:
query_mask = (positional != 0).astype(float)
mask = (positional != 0).astype(float)
mask = fluid.layers.expand(fluid.layers.unsqueeze(mask,[1]), [1,x.shape[1], 1])
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1]))
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x = ffn(x)
attentions.append(attention)
return x, query_mask, attentions
class Decoder(dg.Layer):
def __init__(self, name_scope, num_hidden, config):
super(Decoder, self).__init__(name_scope)
self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha')
self.alpha = self.create_parameter(param, shape=(1,), dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(name_scope=self.full_name(),
size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = DecoderPrenet(self.full_name(),
input_size = config.audio.num_mels,
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.linear = FC(self.full_name(), num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(self.full_name(), num_hidden) for _ in range(3)]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [FFN(self.full_name(), num_hidden) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = FC(self.full_name(), num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = FC(self.full_name(), num_hidden, 1, gain = 1)
self.postconvnet = PostConvNet(self.full_name(), config)
def forward(self, key, value, query, c_mask, positional):
batch_size = key.shape[0]
decoder_len = query.shape[1]
# get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = (positional != 0).astype(float)
mask = np.repeat(np.expand_dims(m_mask.numpy() == 0, axis=1), decoder_len, axis=1)
mask = mask + np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
# (batch_size, decoder_len, decoder_len)
zero_mask = fluid.layers.expand(fluid.layers.unsqueeze((c_mask != 0).astype(float), axes=2), [1,1,decoder_len])
# (batch_size, decoder_len, seq_len)
zero_mask = fluid.layers.transpose(zero_mask, [0,2,1])
else:
mask = np.repeat(np.expand_dims(np.triu(np.ones([decoder_len, decoder_len]), 1), axis=0) ,batch_size, axis=0)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
# Get position embedding
positional = self.pos_emb(fluid.layers.unsqueeze(positional, axes=[-1]))
query = positional * self.alpha + query
#positional dropout
query = fluid.layers.dropout(query, 0.1)
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
# Mel linear projection
mel_out = self.mel_linear(query)
# Post Mel Network
postnet_input = layers.transpose(mel_out, [0,2,1])
out = self.postconvnet(postnet_input)
out = postnet_input + out
out = layers.transpose(out, [0,2,1])
# Stop tokens
stop_tokens = self.stop_linear(query)
return mel_out, out, attn_list, stop_tokens, selfattn_list
class Model(dg.Layer):
def __init__(self, name_scope, config):
super(Model, self).__init__(name_scope)
self.encoder = Encoder(self.full_name(), config.network.embedding_size, config.network.hidden_size, config)
self.decoder = Decoder(self.full_name(), config.network.hidden_size, config)
self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel):
# key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text)
# mel_output/postnet_output (batch_size, mel_len, n_mel)
# attn_probs (128, mel_len, seq_len)
# stop_preds (batch_size, mel_len, 1)
# attns_dec (128, mel_len, mel_len)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
class ModelPostNet(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, name_scope, config):
super(ModelPostNet, self).__init__(name_scope)
self.pre_proj = Conv(self.full_name(),
in_channels = config.audio.num_mels,
out_channels = config.network.hidden_size,
data_format = "NCT")
self.cbhg = CBHG(self.full_name(), config)
self.post_proj = Conv(self.full_name(),
in_channels = config.audio.num_mels,
out_channels = (config.audio.n_fft // 2) + 1,
data_format = "NCT")
def forward(self, mel):
mel = layers.transpose(mel, [0,2,1])
mel = self.pre_proj(mel)
mel = self.cbhg(mel)
mag_pred = self.post_proj(mel)
mag_pred = layers.transpose(mag_pred, [0,2,1])
return mag_pred
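# Data flow sketch for the two models above (shapes follow the comments in
# Model.forward; this is an outline, not executable as-is):
#     model = Model('transtts', cfg)          # text -> mel spectrogram
#     postnet = ModelPostNet('postnet', cfg)  # mel  -> linear magnitude
#     mel_out, post_out, *rest = model(characters, mel_input, pos_text, pos_mel)
#     mag = postnet(post_out)  # (B, T_mel, n_fft // 2 + 1), for Griffin-Lim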
import jsonargparse
def add_config_options_to_parser(parser):
parser.add_argument('--audio.num_mels', type=int, default=80,
help="the number of mel bands when calculating mel spectrograms.")
parser.add_argument('--audio.n_fft', type=int, default=2048,
help="the number of fft components.")
parser.add_argument('--audio.sr', type=int, default=22050,
help="the sampling rate of the audio data.")
parser.add_argument('--audio.preemphasis', type=float, default=0.97,
help="the preemphasis coefficient.")
parser.add_argument('--audio.hop_length', type=int, default=128,
help="the number of samples to advance between frames.")
parser.add_argument('--audio.win_length', type=int, default=1024,
help="the length (width) of the window function.")
parser.add_argument('--audio.power', type=float, default=1.4,
help="the power to raise before griffin-lim.")
parser.add_argument('--audio.min_level_db', type=int, default=-100,
help="the minimum level db.")
parser.add_argument('--audio.ref_level_db', type=int, default=20,
help="the reference level db.")
parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.")
parser.add_argument('--network.hidden_size', type=int, default=256,
help="the hidden size in network.")
parser.add_argument('--network.embedding_size', type=int, default=512,
help="the embedding vector size.")
parser.add_argument('--batch_size', type=int, default=32,
help="batch size for training.")
parser.add_argument('--epochs', type=int, default=10000,
help="the number of epochs for training.")
parser.add_argument('--lr', type=float, default=0.001,
help="the learning rate for training.")
parser.add_argument('--save_step', type=int, default=500,
help="checkpointing interval during training.")
parser.add_argument('--image_step', type=int, default=2000,
help="attention image interval during training.")
parser.add_argument('--max_len', type=int, default=400,
help="The max length of audio when synthsis.")
parser.add_argument('--transformer_step', type=int, default=160000,
help="Global step to restore checkpoint of transformer in synthesis.")
parser.add_argument('--postnet_step', type=int, default=100000,
help="Global step to restore checkpoint of postnet in synthesis.")
parser.add_argument('--use_gpu', type=bool, default=True,
help="use gpu or not during training.")
parser.add_argument('--use_data_parallel', type=bool, default=False,
help="use data parallel or not during training.")
parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
help="the path of dataset.")
parser.add_argument('--checkpoint_path', type=str, default=None,
help="the path to load checkpoint or pretrain model.")
parser.add_argument('--save_path', type=str, default='./checkpoint',
help="the path to save checkpoint.")
parser.add_argument('--log_dir', type=str, default='./log',
help="the directory to save tensorboard log.")
parser.add_argument('--sample_path', type=str, default='./log',
help="the directory to save audio sample in synthesis.")
parser.add_argument('-c', '--config', action=jsonargparse.ActionConfigFile)
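# Usage sketch: each script builds its parser with this helper and reads a
# YAML config through -c; flags passed after -c should override values
# loaded from the file.
#     parser = jsonargparse.ArgumentParser()
#     add_config_options_to_parser(parser)
#     cfg = parser.parse_args(['-c', './config/train_transformer.yaml'])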
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
from parakeet import g2p
from parakeet import audio
from parakeet.data.sampler import SequentialSampler, RandomSampler, BatchSampler
from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
_ljspeech_processor = audio.AudioProcessor(
sample_rate=22050,
num_mels=80,
min_level_db=-100,
ref_level_db=20,
n_fft=2048,
win_length= int(22050 * 0.05),
hop_length= int(22050 * 0.0125),
power=1.2,
preemphasis=0.97,
signal_norm=True,
symmetric_norm=False,
max_norm=1.,
mel_fmin=0,
mel_fmax=None,
clip_norm=True,
griffin_lim_iters=60,
do_trim_silence=False,
sound_norm=False)
class LJSpeech(Dataset):
def __init__(self, root):
super(LJSpeech, self).__init__()
assert isinstance(root, (str, Path)), "root should be a string or Path object"
self.root = root if isinstance(root, Path) else Path(root)
self.metadata = self._prepare_metadata()
def _prepare_metadata(self):
csv_path = self.root.joinpath("metadata.csv")
metadata = pd.read_csv(csv_path, sep="|", header=None, quoting=3,
names=["fname", "raw_text", "normalized_text"])
return metadata
def _get_example(self, metadatum):
"""All the code for generating an Example from a metadatum. If you want a
different preprocessing pipeline, you can override this method.
This method may require several processor, each of which has a lot of options.
In this case, you'd better pass a composed transform and pass it to the init
method.
"""
fname, raw_text, normalized_text = metadatum
wav_path = self.root.joinpath("wavs", fname + ".wav")
# load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
wav = _ljspeech_processor.load_wav(str(wav_path))
mag = _ljspeech_processor.spectrogram(wav).astype(np.float32)
mel = _ljspeech_processor.melspectrogram(wav).astype(np.float32)
phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
def _batch_examples(self, minibatch):
mag_batch = []
mel_batch = []
phoneme_batch = []
for example in minibatch:
mag, mel, phoneme = example
mag_batch.append(mag)
mel_batch.append(mel)
phoneme_batch.append(phoneme)
mag_batch = SpecBatcher(pad_value=0.)(mag_batch)
mel_batch = SpecBatcher(pad_value=0.)(mel_batch)
phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
return (mag_batch, mel_batch, phoneme_batch)
def __getitem__(self, index):
metadatum = self.metadata.iloc[index]
example = self._get_example(metadatum)
return example
def __iter__(self):
for i in range(len(self)):
yield self[i]
def __len__(self):
return len(self.metadata)
def batch_examples(batch):
texts = []
mels = []
mel_inputs = []
text_lens = []
pos_texts = []
pos_mels = []
for data in batch:
_, mel, text = data
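# Build the decoder's teacher-forcing input: drop the last frame and
# prepend an all-zero "go" frame, shifting the mel right by one step.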
mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
text_lens.append(len(text))
pos_texts.append(np.arange(1, len(text) + 1))
pos_mels.append(np.arange(1, mel.shape[1] + 1))
mels.append(mel)
texts.append(text)
# Sort the batch by text_len in descending order
order = sorted(range(len(text_lens)), key=lambda idx: text_lens[idx], reverse=True)
texts = [texts[idx] for idx in order]
mels = [mels[idx] for idx in order]
mel_inputs = [mel_inputs[idx] for idx in order]
pos_texts = [pos_texts[idx] for idx in order]
pos_mels = [pos_mels[idx] for idx in order]
text_lens = [text_lens[idx] for idx in order]
# Pad sequence with largest len of the batch
texts = TextIDBatcher(pad_id=0)(texts)
pos_texts = TextIDBatcher(pad_id=0)(pos_texts)
pos_mels = TextIDBatcher(pad_id=0)(pos_mels)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))
return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens))
def batch_examples_postnet(batch):
mels=[]
mags=[]
for data in batch:
mag, mel, _ = data
mels.append(mel)
mags.append(mag)
mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
return (mels, mags)
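if __name__ == "__main__":
    # Smoke test, assuming the LJSpeech-1.1 corpus sits at the path below.
    dataset = LJSpeech("../../../dataset/LJSpeech-1.1")
    batch = [dataset[i] for i in range(4)]
    texts, mels, mel_inputs, pos_texts, pos_mels, text_lens = batch_examples(batch)
    # Expected: (4, T_text), (4, T_mel, 80), (4, T_mel, 80)
    print(texts.shape, mels.shape, mel_inputs.shape)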
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
import numpy as np
from network import Model, ModelPostNet
from tqdm import tqdm
from tensorboardX import SummaryWriter
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
from preprocess import _ljspeech_processor
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
def load_checkpoint(step, model_path):
model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
return model_dict
def synthesis(text_input, cfg):
place = (fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace())
# tensorboard
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'synthesis')
writer = SummaryWriter(path)
with dg.guard(place):
model = Model('transtts', cfg)
model_postnet = ModelPostNet('postnet', cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
model_postnet.set_dict(load_checkpoint(str(cfg.postnet_step), os.path.join(cfg.checkpoint_path, "postnet")))
# init input
text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
model.eval()
model_postnet.eval()
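# Autoregressive decoding: each step feeds every frame generated so far
# back through the model and appends only the newest predicted frame.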
pbar = tqdm(range(cfg.max_len))
for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
mag_pred = model_postnet(postnet_pred)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
writer.add_audio(text_input, wav, 0, cfg.audio.sr)
if not os.path.exists(cfg.sample_path):
os.mkdir(cfg.sample_path)
write(os.path.join(cfg.sample_path,'test.wav'), cfg.audio.sr, wav)
if __name__ == '__main__':
parser = jsonargparse.ArgumentParser(description="Synthesis model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/synthesis.yaml'.split())
synthesis("Transformer model is so fast!", cfg)
\ No newline at end of file
from network import *
from preprocess import batch_examples_postnet, LJSpeech
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
from parakeet.data.datacargo import DataCargo
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
elif key == "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main():
parser = jsonargparse.ArgumentParser(description="Train postnet model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_postnet.yaml'.split())
local_rank = dg.parallel.Env().local_rank
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
LJSPEECH_ROOT = Path(cfg.data_path)
dataset = LJSpeech(LJSPEECH_ROOT)
dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples_postnet, drop_last=True)
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'postnet')
writer = SummaryWriter(path)
with dg.guard(place):
# dataloader
input_fields = {
'names': ['mel', 'mag'],
'shapes':
[[cfg.batch_size, None, 80], [cfg.batch_size, None, 257]],
'dtypes': ['float32', 'float32'],
'lod_levels': [0, 0]
}
inputs = [
fluid.data(
name=input_fields['names'][i],
shape=input_fields['shapes'][i],
dtype=input_fields['dtypes'][i],
lod_level=input_fields['lod_levels'][i])
for i in range(len(input_fields['names']))
]
reader = fluid.io.DataLoader.from_generator(
feed_list=inputs,
capacity=32,
iterable=True,
use_double_buffer=True,
return_list=True)
model = ModelPostNet('postnet', cfg)
model.train()
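# Noam warm-up schedule: with d_model = 1 / (4000 * lr^2), the learning
# rate rises linearly for 4000 steps, peaks at exactly cfg.lr, then decays
# proportionally to 1 / sqrt(step).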
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr ** 2)), 4000))
if cfg.checkpoint_path is not None:
model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
print("load checkpoint!!!")
if cfg.use_data_parallel:
strategy = dg.parallel.prepare_context()
model = MyDataParallel(model, strategy)
for epoch in range(cfg.epochs):
reader.set_batch_generator(dataloader, place)
pbar = tqdm(reader())
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
mel, mag = data
mag = dg.to_variable(mag.numpy())
mel = dg.to_variable(mel.numpy())
global_step += 1
mag_pred = model(mel)
loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
if cfg.use_data_parallel:
loss = model.scale_loss(loss)
writer.add_scalars('training_loss',{
'loss':loss.numpy(),
}, global_step)
loss.backward()
if cfg.use_data_parallel:
model.apply_collective_grads()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
model.clear_gradients()
if global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'postnet/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if __name__ == '__main__':
main()
\ No newline at end of file
from preprocess import batch_examples, LJSpeech
import os
from tqdm import tqdm
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
from network import *
from tensorboardX import SummaryWriter
from parakeet.data.datacargo import DataCargo
from pathlib import Path
import jsonargparse
from parse import add_config_options_to_parser
from pprint import pprint
from matplotlib import cm
class MyDataParallel(dg.parallel.DataParallel):
"""
A data parallel proxy for model.
"""
def __init__(self, layers, strategy):
super(MyDataParallel, self).__init__(layers, strategy)
def __getattr__(self, key):
if key in self.__dict__:
return object.__getattribute__(self, key)
elif key == "_layers":
return object.__getattribute__(self, "_sub_layers")["_layers"]
else:
return getattr(
object.__getattribute__(self, "_sub_layers")["_layers"], key)
def main():
parser = jsonargparse.ArgumentParser(description="Train TransformerTTS model", formatter_class='default_argparse')
add_config_options_to_parser(parser)
cfg = parser.parse_args('-c ./config/train_transformer.yaml'.split())
local_rank = dg.parallel.Env().local_rank
if local_rank == 0:
# Print the whole config setting.
pprint(jsonargparse.namespace_to_dict(cfg))
LJSPEECH_ROOT = Path(cfg.data_path)
dataset = LJSpeech(LJSPEECH_ROOT)
dataloader = DataCargo(dataset, batch_size=cfg.batch_size, shuffle=True, collate_fn=batch_examples, drop_last=True)
global_step = 0
place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
if cfg.use_data_parallel else fluid.CUDAPlace(0)
if cfg.use_gpu else fluid.CPUPlace())
if not os.path.exists(cfg.log_dir):
os.mkdir(cfg.log_dir)
path = os.path.join(cfg.log_dir,'transformer')
writer = SummaryWriter(path) if local_rank == 0 else None
with dg.guard(place):
if cfg.use_data_parallel:
strategy = dg.parallel.prepare_context()
# dataloader
input_fields = {
'names': ['character', 'mel', 'mel_input', 'pos_text', 'pos_mel', 'text_len'],
'shapes':
[[cfg.batch_size, None], [cfg.batch_size, None, 80], [cfg.batch_size, None, 80], [cfg.batch_size, 1], [cfg.batch_size, 1], [cfg.batch_size, 1]],
'dtypes': ['float32', 'float32', 'float32', 'int64', 'int64', 'int64'],
'lod_levels': [0, 0, 0, 0, 0, 0]
}
inputs = [
fluid.data(
name=input_fields['names'][i],
shape=input_fields['shapes'][i],
dtype=input_fields['dtypes'][i],
lod_level=input_fields['lod_levels'][i])
for i in range(len(input_fields['names']))
]
reader = fluid.io.DataLoader.from_generator(
feed_list=inputs,
capacity=32,
iterable=True,
use_double_buffer=True,
return_list=True)
model = Model('transtts', cfg)
model.train()
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1 / (4000 * (cfg.lr ** 2)), 4000))
if cfg.checkpoint_path is not None:
model_dict, opti_dict = fluid.dygraph.load_dygraph(cfg.checkpoint_path)
model.set_dict(model_dict)
optimizer.set_dict(opti_dict)
print("load checkpoint!!!")
if cfg.use_data_parallel:
model = MyDataParallel(model, strategy)
for epoch in range(cfg.epochs):
reader.set_batch_generator(dataloader, place)
pbar = tqdm(reader())
for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch)
character, mel, mel_input, pos_text, pos_mel, text_length = data
global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
loss = mel_loss + post_mel_loss
if cfg.use_data_parallel:
loss = model.scale_loss(loss)
writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(),
'post_mel_loss':post_mel_loss.numpy(),
}, global_step)
writer.add_scalars('alphas', {
'encoder_alpha':model.encoder.alpha.numpy(),
'decoder_alpha':model.decoder.alpha.numpy(),
}, global_step)
writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
if global_step % cfg.image_step == 1:
for i, prob in enumerate(attn_probs):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_probs_%d_0'%global_step, x, i*4+j, dataformats="HWC")
for i, prob in enumerate(attn_enc):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
for i, prob in enumerate(attn_dec):
for j in range(4):
x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
loss.backward()
if cfg.use_data_parallel:
model.apply_collective_grads()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(1))
model.clear_gradients()
# save checkpoint
if local_rank==0 and global_step % cfg.save_step == 0:
if not os.path.exists(cfg.save_path):
os.mkdir(cfg.save_path)
save_path = os.path.join(cfg.save_path,'transformer/%d' % global_step)
dg.save_dygraph(model.state_dict(), save_path)
dg.save_dygraph(optimizer.state_dict(), save_path)
if local_rank==0:
writer.close()
if __name__ =='__main__':
main()
\ No newline at end of file
import numpy as np
import librosa
import os, copy
from scipy import signal
def get_positional_table(d_pos_vec, n_position=1024):
position_enc = np.array([
[pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
return position_enc
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
''' Sinusoid position encoding table '''
def cal_angle(position, hid_idx):
return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)
def get_posi_angle_vec(position):
return [cal_angle(position, hid_j) for hid_j in range(d_hid)]
sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if padding_idx is not None:
# zero vector for padding dimension
sinusoid_table[padding_idx] = 0.
return sinusoid_table
def guided_attention(N, T, g=0.2):
'''Guided attention mask. Refer to page 3 of the paper.'''
W = np.zeros((N, T), dtype=np.float32)
for n_pos in range(W.shape[0]):
for t_pos in range(W.shape[1]):
W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N)) ** 2 / (2 * g * g))
return W
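if __name__ == "__main__":
    # Quick sanity checks for the helpers above (sizes are illustrative).
    table = get_sinusoid_encoding_table(1024, 256, padding_idx=0)
    print(table.shape)       # (1024, 256)
    print(table[0].sum())    # 0.0 -- the padding row is zeroed out
    # guided_attention penalizes attention far from the diagonal:
    # W[n, t] = 1 - exp(-(t / T - n / N) ** 2 / (2 * g * g))
    W = guided_attention(N=5, T=10)
    print(W.shape, float(W.max()))  # (5, 10); penalties lie in [0, 1)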
......@@ -7,4 +7,4 @@ LJSPEECH_ROOT = Path("/workspace/datasets/LJSpeech-1.1")
ljspeech = LJSpeech(LJSPEECH_ROOT)
ljspeech_cargo = DataCargo(ljspeech, batch_size=16, shuffle=True)
for i, batch in enumerate(ljspeech_cargo):
print(i)
\ No newline at end of file
print(i)