Commit 5b2d2a37 authored by lifuchen, committed by chenfeiyu

add TransformerTTS and fastspeech

Parent 185e25fe
# ---- parakeet/models/transformerTTS/CBHG.py ----
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Pool1D, Linear
from parakeet.modules.dynamicGRU import DynamicGRU

class CBHG(dg.Layer):
def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2,
max_pool_kernel_size=2, is_post=False):
        """CBHG module: convolution bank + highway network + bidirectional GRU.

        :param hidden_size: dimension of hidden unit
        :param batch_size: batch size
        :param K: number of convolution banks
        :param projection_size: dimension of projection unit
        :param num_gru_layers: number of GRU layers
        :param max_pool_kernel_size: max pooling kernel size
        :param is_post: whether this is the post-processing network
        """
        super(CBHG, self).__init__()
self.hidden_size = hidden_size
self.projection_size = projection_size
self.conv_list = []
self.conv_list.append(Conv(in_channels = projection_size,
out_channels = hidden_size,
filter_size = 1,
padding = int(np.floor(1/2)),
data_format = "NCT"))
for i in range(2,K+1):
self.conv_list.append(Conv(in_channels = hidden_size,
out_channels = hidden_size,
filter_size = i,
padding = int(np.floor(i/2)),
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batchnorm_list = []
for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size,
data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list):
self.add_sublayer("batchnorm_list_{}".format(i), layer)
conv_outdim = hidden_size * K
self.conv_projection_1 = Conv(in_channels = conv_outdim,
out_channels = hidden_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.conv_projection_2 = Conv(in_channels = hidden_size,
out_channels = projection_size,
filter_size = 3,
padding = int(np.floor(3/2)),
data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max',
pool_stride=1,
pool_padding=1,
data_format = "NCT")
self.highway = Highwaynet(self.projection_size)
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0)
self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse = False,
origin_mode = True,
h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
is_reverse=True,
origin_mode=True,
h_0 = h_0)
    def _conv_fit_dim(self, x, filter_size=3):
        # With symmetric padding of floor(filter_size / 2), an even filter
        # produces one extra timestep; trim it so the length matches the input.
        if filter_size % 2 == 0:
            return x[:, :, :-1]
        else:
            return x
def forward(self, input_):
# input_.shape = [N, C, T]
conv_list = []
conv_input = input_
for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
conv_input = self._conv_fit_dim(conv(conv_input), i+1)
conv_input = layers.relu(batchnorm(conv_input))
conv_list.append(conv_input)
conv_cat = layers.concat(conv_list, axis=1)
conv_pool = self.max_pool(conv_cat)[:,:,:-1]
conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
# conv_proj.shape = [N, C, T]
highway = layers.transpose(conv_proj, [0,2,1])
highway = self.highway(highway)
# highway.shape = [N, T, C]
fc_forward = self.fc_forward1(highway)
fc_reverse = self.fc_reverse1(highway)
out_forward = self.gru_forward1(fc_forward)
out_reverse = self.gru_reverse1(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
fc_forward = self.fc_forward2(out)
fc_reverse = self.fc_reverse2(out)
out_forward = self.gru_forward2(fc_forward)
out_reverse = self.gru_reverse2(fc_reverse)
out = layers.concat([out_forward, out_reverse], axis=-1)
out = layers.transpose(out, [0,2,1])
return out
class Highwaynet(dg.Layer):
def __init__(self, num_units, num_layers=4):
super(Highwaynet, self).__init__()
self.num_units = num_units
self.num_layers = num_layers
self.gates = []
self.linears = []
for i in range(num_layers):
self.linears.append(Linear(num_units, num_units))
self.gates.append(Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear)
self.add_sublayer("gates_{}".format(i), gate)
def forward(self, input_):
out = input_
for linear, gate in zip(self.linears, self.gates):
h = fluid.layers.relu(linear(out))
t_ = fluid.layers.sigmoid(gate(out))
c = 1 - t_
out = h * t_ + out * c
return out
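
# A minimal numpy illustration of the highway rule implemented above,
# out = h * t + x * (1 - t), where h is the relu transform path and t the
# sigmoid gate. The weights are random placeholders, not trained parameters;
# this is a sketch, not part of the network.
def _highway_demo(num_units=8):
    x = np.random.randn(2, num_units).astype("float32")
    w_h = np.random.randn(num_units, num_units).astype("float32")
    w_t = np.random.randn(num_units, num_units).astype("float32")
    h = np.maximum(0.0, x @ w_h)          # transform path (relu)
    t = 1.0 / (1.0 + np.exp(-(x @ w_t)))  # gate (sigmoid)
    return h * t + x * (1.0 - t)          # same shape as x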

# ---- parakeet/models/transformerTTS/decoder.py ----
import numpy as np
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
from parakeet.modules.prenet import PreNet
from parakeet.modules.post_convnet import PostConvNet
class Decoder(dg.Layer):
def __init__(self, num_hidden, config, num_head=4):
super(Decoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
hidden_size = num_hidden * 2,
output_size = num_hidden,
dropout_rate=0.2)
self.linear = Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.attn_layers):
self.add_sublayer("attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = Linear(num_hidden, 1)
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5,
outputs_per_step=config.audio.outputs_per_step,
use_cudnn = config.use_gpu)
def forward(self, key, value, query, c_mask, positional):
# get decoder mask with triangular matrix
if fluid.framework._dygraph_tracer()._train_mode:
m_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
mask = mask + triu_tensor
mask = fluid.layers.cast(mask == 0, np.float32)
# (batch_size, decoder_len, encoder_len)
zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
else:
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None
# Decoder pre-network
query = self.decoder_prenet(query)
# Centered position
query = self.linear(query)
# Get position embedding
positional = self.pos_emb(positional)
query = positional * self.alpha + query
#positional dropout
query = fluid.layers.dropout(query, 0.1)
# Attention decoder-decoder, encoder-decoder
selfattn_list = list()
attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query)
selfattn_list.append(attn_dec)
attn_list.append(attn_dot)
# Mel linear projection
mel_out = self.mel_linear(query)
# Post Mel Network
out = self.postconvnet(mel_out)
out = mel_out + out
# Stop tokens
stop_tokens = self.stop_linear(query)
stop_tokens = layers.squeeze(stop_tokens, [-1])
stop_tokens = layers.sigmoid(stop_tokens)
return mel_out, out, attn_list, stop_tokens, selfattn_list
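
# The decoder above sums a padding mask with an upper-triangular "future"
# mask, then keeps only positions where the sum is zero. A minimal numpy
# sketch of the causal part (get_triu_tensor's exact layout may differ; this
# only illustrates the idea):
def _causal_mask_demo(mel_len=4):
    future = np.triu(np.ones((mel_len, mel_len), dtype="float32"), k=1)
    # 1 marks a position that would attend to the future; invert so that
    # 1 = visible and 0 = masked, matching the `mask == 0` cast above
    return (future == 0).astype("float32")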

# ---- parakeet/models/transformerTTS/encoder.py ----
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward
from parakeet.models.transformerTTS.encoderprenet import EncoderPrenet
class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config, num_head=4):
super(Encoder, self).__init__()
self.num_hidden = num_hidden
param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False))
self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size,
num_hidden = num_hidden,
use_cudnn=config.use_gpu)
self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
for i, layer in enumerate(self.layers):
self.add_sublayer("self_attn_{}".format(i), layer)
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = config.use_gpu) for _ in range(3)]
for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer)
def forward(self, x, positional):
if fluid.framework._dygraph_tracer()._train_mode:
query_mask = get_non_pad_mask(positional)
mask = get_attn_key_pad_mask(positional, x)
else:
query_mask, mask = None, None
# Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding
positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C)
# Positional dropout
x = layers.dropout(x, 0.1)
# Self attention encoder
attentions = list()
for layer, ffn in zip(self.layers, self.ffns):
x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
x = ffn(x)
attentions.append(attention)
return x, query_mask, attentions
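
# The encoder adds a fixed sinusoid position table (scaled by the learned
# scalar `alpha`) to the prenet output. A numpy sketch of how such a table is
# typically built; the real table comes from get_sinusoid_encoding_table in
# parakeet.modules.utils and may differ in details such as padding_idx:
def _sinusoid_table_demo(n_position, d_hid):
    import numpy as np
    pos = np.arange(n_position, dtype="float32")[:, None]
    dim = np.arange(d_hid, dtype="float32")[None, :]
    angle = pos / np.power(10000.0, 2.0 * (dim // 2) / d_hid)
    table = np.zeros((n_position, d_hid), dtype="float32")
    table[:, 0::2] = np.sin(angle[:, 0::2])  # even dimensions: sine
    table[:, 1::2] = np.cos(angle[:, 1::2])  # odd dimensions: cosine
    return table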

# ---- parakeet/models/transformerTTS/encoderprenet.py ----
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Linear
import numpy as np
class EncoderPrenet(dg.Layer):
def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size
self.num_hidden = num_hidden
self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
padding_idx = None)
self.conv_list = []
self.conv_list.append(Conv(in_channels = embedding_size,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for _ in range(2):
self.conv_list.append(Conv(in_channels = num_hidden,
out_channels = num_hidden,
filter_size = 5,
padding = int(np.floor(5/2)),
use_cudnn = use_cudnn,
data_format = "NCT"))
for i, layer in enumerate(self.conv_list):
self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden,
data_layout='NCHW') for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.projection = Linear(num_hidden, num_hidden)
def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, embedding_size)
x = layers.transpose(x,[0,2,1])
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
x = layers.transpose(x,[0,2,1]) #(N,T,C)
x = self.projection(x)
return x
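
# Shape walk-through: character ids (N, T) -> embedding (N, T, embedding_size)
# -> transpose to NCT for the conv stack -> back to NTC -> linear projection.
# A minimal usage sketch on random ids (assumes dygraph defaults to CPU here
# when use_cudnn=False):
def _encoder_prenet_demo():
    with dg.guard():
        prenet = EncoderPrenet(embedding_size=512, num_hidden=256, use_cudnn=False)
        ids = np.random.randint(1, len(symbols), size=(2, 50)).astype("int64")
        out = prenet(dg.to_variable(ids))
        return out  # expected shape: (2, 50, 256)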

# ---- parakeet/models/transformerTTS: top-level TransformerTTS network ----
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformerTTS.encoder import Encoder
from parakeet.models.transformerTTS.decoder import Decoder
class TransformerTTS(dg.Layer):
def __init__(self, config):
super(TransformerTTS, self).__init__()
self.encoder = Encoder(config.embedding_size, config.hidden_size, config)
self.decoder = Decoder(config.hidden_size, config)
self.config = config
def forward(self, characters, mel_input, pos_text, pos_mel):
        # key: (batch_size, seq_len, channel)
        # c_mask: (batch_size, seq_len, 1), non-pad mask from the encoder
        # attns_enc: per-layer encoder self-attention weights,
        #            each (batch_size * num_head, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text)
        # mel_output/postnet_output: (batch_size, mel_len, num_mels * outputs_per_step)
        # attn_probs: per-layer encoder-decoder attention weights,
        #             each (batch_size * num_head, mel_len, seq_len)
        # stop_preds: (batch_size, mel_len)
        # attns_dec: per-layer decoder self-attention weights,
        #            each (batch_size * num_head, mel_len, mel_len)
mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(key, key, mel_input, c_mask, pos_mel)
return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
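
# End-to-end sketch: character ids and positions in, mel and post-net frames
# out. `cfg` is a hypothetical stand-in for the real config object; only the
# attributes actually read by Encoder/Decoder (embedding_size, hidden_size,
# use_gpu, audio.num_mels, audio.outputs_per_step, ...) need to exist on it:
def _transformer_tts_demo(cfg):
    import numpy as np
    with dg.guard():
        model = TransformerTTS(cfg)
        text = dg.to_variable(np.random.randint(1, 30, (2, 40)).astype("int64"))
        mel = dg.to_variable(np.random.randn(2, 100, cfg.audio.num_mels).astype("float32"))
        pos_text = dg.to_variable(np.tile(np.arange(1, 41, dtype="int64"), (2, 1)))
        pos_mel = dg.to_variable(np.tile(np.arange(1, 101, dtype="int64"), (2, 1)))
        return model(text, mel, pos_text, pos_mel)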

# ---- Vocoder (CBHG mel-to-linear) ----
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import *
from parakeet.models.transformerTTS.CBHG import CBHG
class Vocoder(dg.Layer):
"""
CBHG Network (mel -> linear)
"""
def __init__(self, config):
super(Vocoder, self).__init__()
self.pre_proj = Conv1D(in_channels = config.audio.num_mels,
out_channels = config.hidden_size,
filter_size=1,
data_format = "NCT")
self.cbhg = CBHG(config.hidden_size, config.batch_size)
self.post_proj = Conv1D(in_channels = config.hidden_size,
out_channels = (config.audio.n_fft // 2) + 1,
filter_size=1,
data_format = "NCT")
def forward(self, mel):
mel = layers.transpose(mel, [0,2,1])
mel = self.pre_proj(mel)
mel = self.cbhg(mel)
mag_pred = self.post_proj(mel)
mag_pred = layers.transpose(mag_pred, [0,2,1])
return mag_pred
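
# Shape walk-through: mel (N, T, num_mels) -> NCT -> 1x1 conv to hidden_size
# -> CBHG -> 1x1 conv to n_fft // 2 + 1 linear-frequency bins -> (N, T, bins).
# A minimal usage sketch; `cfg` is again a hypothetical config stub, and the
# batch must equal cfg.batch_size because CBHG's GRU initial state is built
# from it:
def _vocoder_demo(cfg):
    import numpy as np
    with dg.guard():
        net = Vocoder(cfg)
        mel = dg.to_variable(
            np.random.randn(cfg.batch_size, 100, cfg.audio.num_mels).astype("float32"))
        return net(mel)  # (cfg.batch_size, 100, cfg.audio.n_fft // 2 + 1)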