Commit ab0fe8f3 authored by lifuchen, committed by chenfeiyu

TransformerTTS precision alignment

Parent ae88be34
...@@ -89,7 +89,7 @@ def transliteration_cleaners(text): ...@@ -89,7 +89,7 @@ def transliteration_cleaners(text):
def english_cleaners(text): def english_cleaners(text):
'''Pipeline for English text, including number and abbreviation expansion.''' '''Pipeline for English text, including number and abbreviation expansion.'''
text = convert_to_ascii(text) text = convert_to_ascii(text)
text = add_punctuation(text) #text = add_punctuation(text)
text = lowercase(text) text = lowercase(text)
text = expand_numbers(text) text = expand_numbers(text)
text = expand_abbreviations(text) text = expand_abbreviations(text)
......
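The only functional change in this hunk is that `add_punctuation` is commented out of `english_cleaners`. A minimal sketch of the resulting order, with simplified stand-ins for the helpers (the stand-in bodies are assumptions; the real helpers live in `parakeet.g2p` and do more):

```python
import re

def convert_to_ascii(text):
    return text.encode("ascii", "ignore").decode("ascii")

def lowercase(text):
    return text.lower()

def expand_numbers(text):
    return text  # placeholder: the real cleaner spells numbers out

def expand_abbreviations(text):
    return re.sub(r"\bdr\.", "doctor", text)

def english_cleaners(text):
    # add_punctuation(text) is now skipped
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    return text

print(english_cleaners("Dr. Strange reads 2 books"))
```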
...@@ -14,13 +14,11 @@ encoder_n_layer: 6 ...@@ -14,13 +14,11 @@ encoder_n_layer: 6
encoder_head: 2 encoder_head: 2
encoder_conv1d_filter_size: 1536 encoder_conv1d_filter_size: 1536
max_sep_len: 2048 max_sep_len: 2048
encoder_output_size: 384 fs_embedding_size: 384
embedding_size: 384
decoder_n_layer: 6 decoder_n_layer: 6
decoder_head: 2 decoder_head: 2
decoder_conv1d_filter_size: 1536 decoder_conv1d_filter_size: 1536
decoder_output_size: 384 fs_hidden_size: 384
hidden_size: 384
duration_predictor_output_size: 256 duration_predictor_output_size: 256
duration_predictor_filter_size: 3 duration_predictor_filter_size: 3
fft_conv1d_filter: 3 fft_conv1d_filter: 3
...@@ -28,6 +26,9 @@ fft_conv1d_padding: 1 ...@@ -28,6 +26,9 @@ fft_conv1d_padding: 1
dropout: 0.1 dropout: 0.1
transformer_head: 4 transformer_head: 4
embedding_size: 512
hidden_size: 256
warm_up_step: 4000 warm_up_step: 4000
grad_clip_thresh: 0.1 grad_clip_thresh: 0.1
batch_size: 32 batch_size: 32
...@@ -39,5 +40,5 @@ use_data_parallel: False ...@@ -39,5 +40,5 @@ use_data_parallel: False
data_path: ../../../dataset/LJSpeech-1.1 data_path: ../../../dataset/LJSpeech-1.1
transtts_path: ../transformerTTS/checkpoint transtts_path: ../transformerTTS/checkpoint
transformer_step: 20 transformer_step: 1
log_dir: ./log log_dir: ./log
\ No newline at end of file
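With these config edits, the FastSpeech dimensions get their own keys (`fs_embedding_size`, `fs_hidden_size`) so they no longer collide with the teacher TransformerTTS keys (`embedding_size`, `hidden_size`) added in the same file. A hedged illustration of reading such a config (the standalone YAML loader is an assumption; the scripts actually merge these values through argparse):

```python
import yaml  # assumes PyYAML is available

cfg = yaml.safe_load("""
fs_embedding_size: 384   # FastSpeech embedding dim
fs_hidden_size: 384      # FastSpeech hidden (d_model) dim
embedding_size: 512      # teacher TransformerTTS embedding dim
hidden_size: 256         # teacher TransformerTTS hidden dim
transformer_step: 1      # teacher checkpoint step to load
""")
print(cfg["fs_hidden_size"], cfg["hidden_size"])  # 384 256
```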
...@@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D ...@@ -8,8 +8,6 @@ from parakeet.modules.layers import Conv1D
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward from parakeet.modules.feed_forward import PositionwiseFeedForward
class FFTBlock(dg.Layer): class FFTBlock(dg.Layer):
def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2): def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
super(FFTBlock, self).__init__() super(FFTBlock, self).__init__()
......
from utils import * from utils import *
from modules import * from modules import FFTBlock, LengthRegulator
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.g2p.text.symbols import symbols from parakeet.g2p.text.symbols import symbols
...@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer): ...@@ -131,38 +131,38 @@ class FastSpeech(dg.Layer):
self.encoder = Encoder(n_src_vocab=len(symbols)+1, self.encoder = Encoder(n_src_vocab=len(symbols)+1,
len_max_seq=cfg.max_sep_len, len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size, d_word_vec=cfg.fs_embedding_size,
n_layers=cfg.encoder_n_layer, n_layers=cfg.encoder_n_layer,
n_head=cfg.encoder_head, n_head=cfg.encoder_head,
d_k=64, d_k=64,
d_v=64, d_v=64,
d_model=cfg.hidden_size, d_model=cfg.fs_hidden_size,
d_inner=cfg.encoder_conv1d_filter_size, d_inner=cfg.encoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding, fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1) dropout=0.1)
self.length_regulator = LengthRegulator(input_size=cfg.hidden_size, self.length_regulator = LengthRegulator(input_size=cfg.fs_hidden_size,
out_channels=cfg.duration_predictor_output_size, out_channels=cfg.duration_predictor_output_size,
filter_size=cfg.duration_predictor_filter_size, filter_size=cfg.duration_predictor_filter_size,
dropout=cfg.dropout) dropout=cfg.dropout)
self.decoder = Decoder(len_max_seq=cfg.max_sep_len, self.decoder = Decoder(len_max_seq=cfg.max_sep_len,
d_word_vec=cfg.embedding_size, d_word_vec=cfg.fs_embedding_size,
n_layers=cfg.decoder_n_layer, n_layers=cfg.decoder_n_layer,
n_head=cfg.decoder_head, n_head=cfg.decoder_head,
d_k=64, d_k=64,
d_v=64, d_v=64,
d_model=cfg.hidden_size, d_model=cfg.fs_hidden_size,
d_inner=cfg.decoder_conv1d_filter_size, d_inner=cfg.decoder_conv1d_filter_size,
fft_conv1d_kernel=cfg.fft_conv1d_filter, fft_conv1d_kernel=cfg.fft_conv1d_filter,
fft_conv1d_padding=cfg.fft_conv1d_padding, fft_conv1d_padding=cfg.fft_conv1d_padding,
dropout=0.1) dropout=0.1)
self.mel_linear = dg.Linear(cfg.decoder_output_size, cfg.audio.num_mels) self.mel_linear = dg.Linear(cfg.fs_hidden_size, cfg.audio.num_mels * cfg.audio.outputs_per_step)
self.postnet = PostConvNet(n_mels=80, self.postnet = PostConvNet(n_mels=cfg.audio.num_mels,
num_hidden=512, num_hidden=512,
filter_size=5, filter_size=5,
padding=int(5 / 2), padding=int(5 / 2),
num_conv=5, num_conv=5,
outputs_per_step=1, outputs_per_step=cfg.audio.outputs_per_step,
use_cudnn=True, use_cudnn=True,
dropout=0.1) dropout=0.1)
......
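After the rename, `mel_linear` projects the decoder's `fs_hidden_size` features to `num_mels * outputs_per_step`, and the postnet is built from the same audio config instead of hard-coded values. A quick shape check (numpy stand-in; the sizes are illustrative):

```python
import numpy as np

fs_hidden_size, num_mels, outputs_per_step = 384, 80, 1
batch, t_dec = 2, 50

w = np.zeros((fs_hidden_size, num_mels * outputs_per_step), dtype=np.float32)
decoder_out = np.zeros((batch, t_dec, fs_hidden_size), dtype=np.float32)
mel_out = decoder_out @ w
assert mel_out.shape == (batch, t_dec, num_mels * outputs_per_step)
```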
...@@ -22,8 +22,8 @@ def add_config_options_to_parser(parser): ...@@ -22,8 +22,8 @@ def add_config_options_to_parser(parser):
parser.add_argument('--audio.outputs_per_step', type=int, default=1, parser.add_argument('--audio.outputs_per_step', type=int, default=1,
help="the outputs per step.") help="the outputs per step.")
parser.add_argument('--embedding_size', type=int, default=256, parser.add_argument('--fs_embedding_size', type=int, default=256,
help="the dim size of embedding.") help="the dim size of embedding of fastspeech.")
parser.add_argument('--encoder_n_layer', type=int, default=6, parser.add_argument('--encoder_n_layer', type=int, default=6,
help="the number of FFT Block in encoder.") help="the number of FFT Block in encoder.")
parser.add_argument('--encoder_head', type=int, default=2, parser.add_argument('--encoder_head', type=int, default=2,
...@@ -32,18 +32,14 @@ def add_config_options_to_parser(parser): ...@@ -32,18 +32,14 @@ def add_config_options_to_parser(parser):
help="the filter size of conv1d in encoder.") help="the filter size of conv1d in encoder.")
parser.add_argument('--max_sep_len', type=int, default=2048, parser.add_argument('--max_sep_len', type=int, default=2048,
help="the max length of sequence.") help="the max length of sequence.")
parser.add_argument('--encoder_output_size', type=int, default=256,
help="the output channel size of encoder.")
parser.add_argument('--decoder_n_layer', type=int, default=6, parser.add_argument('--decoder_n_layer', type=int, default=6,
help="the number of FFT Block in decoder.") help="the number of FFT Block in decoder.")
parser.add_argument('--decoder_head', type=int, default=2, parser.add_argument('--decoder_head', type=int, default=2,
help="the attention head number in decoder.") help="the attention head number in decoder.")
parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024, parser.add_argument('--decoder_conv1d_filter_size', type=int, default=1024,
help="the filter size of conv1d in decoder.") help="the filter size of conv1d in decoder.")
parser.add_argument('--decoder_output_size', type=int, default=256, parser.add_argument('--fs_hidden_size', type=int, default=256,
help="the output channel size of decoder.") help="the hidden size in model of fastspeech.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model.")
parser.add_argument('--duration_predictor_output_size', type=int, default=256, parser.add_argument('--duration_predictor_output_size', type=int, default=256,
help="the output size of duration predictior.") help="the output size of duration predictior.")
parser.add_argument('--duration_predictor_filter_size', type=int, default=3, parser.add_argument('--duration_predictor_filter_size', type=int, default=3,
...@@ -57,6 +53,11 @@ def add_config_options_to_parser(parser): ...@@ -57,6 +53,11 @@ def add_config_options_to_parser(parser):
parser.add_argument('--transformer_head', type=int, default=4, parser.add_argument('--transformer_head', type=int, default=4,
help="the attention head num of transformerTTS.") help="the attention head num of transformerTTS.")
parser.add_argument('--hidden_size', type=int, default=256,
help="the hidden size in model of transformerTTS.")
parser.add_argument('--embedding_size', type=int, default=256,
help="the dim size of embedding of transformerTTS.")
parser.add_argument('--warm_up_step', type=int, default=4000, parser.add_argument('--warm_up_step', type=int, default=4000,
help="the warm up step of learning rate.") help="the warm up step of learning rate.")
parser.add_argument('--grad_clip_thresh', type=float, default=1.0, parser.add_argument('--grad_clip_thresh', type=float, default=1.0,
......
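The FastSpeech-specific flags are renamed accordingly, while `--hidden_size` and `--embedding_size` now describe the teacher TransformerTTS. A minimal argparse sketch mirroring the renamed options (defaults are illustrative):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--fs_embedding_size', type=int, default=256,
                    help="the dim size of embedding of fastspeech.")
parser.add_argument('--fs_hidden_size', type=int, default=256,
                    help="the hidden size in model of fastspeech.")
parser.add_argument('--hidden_size', type=int, default=256,
                    help="the hidden size in model of transformerTTS.")
parser.add_argument('--embedding_size', type=int, default=256,
                    help="the dim size of embedding of transformerTTS.")
cfg = parser.parse_args(['--fs_hidden_size', '384'])
print(cfg.fs_hidden_size, cfg.hidden_size)  # 384 256
```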
...@@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols ...@@ -3,20 +3,18 @@ from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from parakeet.modules.layers import Conv, Pool1D from parakeet.modules.layers import Conv, Pool1D, Linear
from parakeet.modules.dynamicGRU import DynamicGRU from parakeet.modules.dynamicGRU import DynamicGRU
import numpy as np import numpy as np
class EncoderPrenet(dg.Layer): class EncoderPrenet(dg.Layer):
def __init__(self, embedding_size, num_hidden, use_cudnn=True): def __init__(self, embedding_size, num_hidden, use_cudnn=True):
super(EncoderPrenet, self).__init__() super(EncoderPrenet, self).__init__()
self.embedding_size = embedding_size self.embedding_size = embedding_size
self.num_hidden = num_hidden self.num_hidden = num_hidden
self.use_cudnn = use_cudnn self.use_cudnn = use_cudnn
self.embedding = dg.Embedding( size = [len(symbols), embedding_size], self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
param_attr = fluid.ParamAttr(name='weight'),
padding_idx = None) padding_idx = None)
self.conv_list = [] self.conv_list = []
self.conv_list.append(Conv(in_channels = embedding_size, self.conv_list.append(Conv(in_channels = embedding_size,
...@@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer): ...@@ -37,16 +35,12 @@ class EncoderPrenet(dg.Layer):
self.add_sublayer("conv_list_{}".format(i), layer) self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden, self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'), data_layout='NCHW', epsilon=1e-30) for _ in range(3)]
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(3)]
for i, layer in enumerate(self.batch_norm_list): for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer) self.add_sublayer("batch_norm_list_{}".format(i), layer)
self.projection = dg.Linear(num_hidden, num_hidden) self.projection = Linear(num_hidden, num_hidden)
def forward(self, x): def forward(self, x):
x = self.embedding(x) #(batch_size, seq_len, embedding_size) x = self.embedding(x) #(batch_size, seq_len, embedding_size)
...@@ -90,10 +84,6 @@ class CBHG(dg.Layer): ...@@ -90,10 +84,6 @@ class CBHG(dg.Layer):
self.batchnorm_list = [] self.batchnorm_list = []
for i in range(K): for i in range(K):
self.batchnorm_list.append(dg.BatchNorm(hidden_size, self.batchnorm_list.append(dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW')) data_layout='NCHW'))
for i, layer in enumerate(self.batchnorm_list): for i, layer in enumerate(self.batchnorm_list):
...@@ -114,16 +104,8 @@ class CBHG(dg.Layer): ...@@ -114,16 +104,8 @@ class CBHG(dg.Layer):
data_format = "NCT") data_format = "NCT")
self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, self.batchnorm_proj_1 = dg.BatchNorm(hidden_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') data_layout='NCHW')
self.batchnorm_proj_2 = dg.BatchNorm(projection_size, self.batchnorm_proj_2 = dg.BatchNorm(projection_size,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') data_layout='NCHW')
self.max_pool = Pool1D(pool_size = max_pool_kernel_size, self.max_pool = Pool1D(pool_size = max_pool_kernel_size,
pool_type='max', pool_type='max',
...@@ -134,32 +116,24 @@ class CBHG(dg.Layer): ...@@ -134,32 +116,24 @@ class CBHG(dg.Layer):
h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32") h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
h_0 = dg.to_variable(h_0) h_0 = dg.to_variable(h_0)
self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3) self.fc_forward1 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3) self.fc_reverse1 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2, self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False, is_reverse = False,
origin_mode = True, origin_mode = True,
h_0 = h_0) h_0 = h_0)
self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2, self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True, is_reverse=True,
origin_mode=True, origin_mode=True,
h_0 = h_0) h_0 = h_0)
self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3) self.fc_forward2 = Linear(hidden_size, hidden_size // 2 * 3)
self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3) self.fc_reverse2 = Linear(hidden_size, hidden_size // 2 * 3)
self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2, self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse = False, is_reverse = False,
origin_mode = True, origin_mode = True,
h_0 = h_0) h_0 = h_0)
self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2, self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
is_reverse=True, is_reverse=True,
origin_mode=True, origin_mode=True,
h_0 = h_0) h_0 = h_0)
...@@ -216,8 +190,8 @@ class Highwaynet(dg.Layer): ...@@ -216,8 +190,8 @@ class Highwaynet(dg.Layer):
self.linears = [] self.linears = []
for i in range(num_layers): for i in range(num_layers):
self.linears.append(dg.Linear(num_units, num_units)) self.linears.append(Linear(num_units, num_units))
self.gates.append(dg.Linear(num_units, num_units)) self.gates.append(Linear(num_units, num_units))
for i, (linear, gate) in enumerate(zip(self.linears,self.gates)): for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
self.add_sublayer("linears_{}".format(i), linear) self.add_sublayer("linears_{}".format(i), linear)
......
from parakeet.models.transformerTTS.module import * from parakeet.models.transformerTTS.module import *
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid as fluid import paddle.fluid as fluid
from parakeet.modules.layers import Conv1D from parakeet.modules.layers import Conv1D, Linear
from parakeet.modules.utils import * from parakeet.modules.utils import *
from parakeet.modules.multihead_attention import MultiheadAttention from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.feed_forward import PositionwiseFeedForward from parakeet.modules.feed_forward import PositionwiseFeedForward
...@@ -13,8 +13,7 @@ class Encoder(dg.Layer): ...@@ -13,8 +13,7 @@ class Encoder(dg.Layer):
def __init__(self, embedding_size, num_hidden, config): def __init__(self, embedding_size, num_hidden, config):
super(Encoder, self).__init__() super(Encoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha', param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
initializer=fluid.initializer.Constant(value=1.0))
self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32') self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden], self.pos_emb = dg.Embedding(size=[1024, num_hidden],
...@@ -39,13 +38,13 @@ class Encoder(dg.Layer): ...@@ -39,13 +38,13 @@ class Encoder(dg.Layer):
else: else:
query_mask, mask = None, None query_mask, mask = None, None
# Encoder pre_network # Encoder pre_network
x = self.encoder_prenet(x) #(N,T,C) x = self.encoder_prenet(x) #(N,T,C)
# Get positional encoding # Get positional encoding
positional = self.pos_emb(positional) positional = self.pos_emb(positional)
x = positional * self.alpha + x #(N, T, C) x = positional * self.alpha + x #(N, T, C)
...@@ -65,21 +64,20 @@ class Decoder(dg.Layer): ...@@ -65,21 +64,20 @@ class Decoder(dg.Layer):
def __init__(self, num_hidden, config): def __init__(self, num_hidden, config):
super(Decoder, self).__init__() super(Decoder, self).__init__()
self.num_hidden = num_hidden self.num_hidden = num_hidden
param = fluid.ParamAttr(name='alpha') param = fluid.ParamAttr()
self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32', self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
default_initializer = fluid.initializer.ConstantInitializer(value=1.0)) default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0) self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
self.pos_emb = dg.Embedding(size=[1024, num_hidden], self.pos_emb = dg.Embedding(size=[1024, num_hidden],
padding_idx=0, padding_idx=0,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name='weight',
initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp), initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
trainable=False)) trainable=False))
self.decoder_prenet = PreNet(input_size = config.audio.num_mels, self.decoder_prenet = PreNet(input_size = config.audio.num_mels,
hidden_size = num_hidden * 2, hidden_size = num_hidden * 2,
output_size = num_hidden, output_size = num_hidden,
dropout_rate=0.2) dropout_rate=0.2)
self.linear = dg.Linear(num_hidden, num_hidden) self.linear = Linear(num_hidden, num_hidden)
self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)] self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//4, num_hidden//4) for _ in range(3)]
for i, layer in enumerate(self.selfattn_layers): for i, layer in enumerate(self.selfattn_layers):
...@@ -90,8 +88,8 @@ class Decoder(dg.Layer): ...@@ -90,8 +88,8 @@ class Decoder(dg.Layer):
self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)] self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*4, filter_size=1) for _ in range(3)]
for i, layer in enumerate(self.ffns): for i, layer in enumerate(self.ffns):
self.add_sublayer("ffns_{}".format(i), layer) self.add_sublayer("ffns_{}".format(i), layer)
self.mel_linear = dg.Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step) self.mel_linear = Linear(num_hidden, config.audio.num_mels * config.audio.outputs_per_step)
self.stop_linear = dg.Linear(num_hidden, 1) self.stop_linear = Linear(num_hidden, 1)
self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size, self.postconvnet = PostConvNet(config.audio.num_mels, config.hidden_size,
filter_size = 5, padding = 4, num_conv=5, filter_size = 5, padding = 4, num_conv=5,
...@@ -115,10 +113,10 @@ class Decoder(dg.Layer): ...@@ -115,10 +113,10 @@ class Decoder(dg.Layer):
mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32) mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32) mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
m_mask, zero_mask = None, None m_mask, zero_mask = None, None
# Decoder pre-network # Decoder pre-network
query = self.decoder_prenet(query) query = self.decoder_prenet(query)
# Centered position # Centered position
query = self.linear(query) query = self.linear(query)
...@@ -132,14 +130,13 @@ class Decoder(dg.Layer): ...@@ -132,14 +130,13 @@ class Decoder(dg.Layer):
# Attention decoder-decoder, encoder-decoder # Attention decoder-decoder, encoder-decoder
selfattn_list = list() selfattn_list = list()
attn_list = list() attn_list = list()
for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns): for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask) query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask) query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
query = ffn(query) query = ffn(query)
selfattn_list.append(attn_dec) selfattn_list.append(attn_dec)
attn_list.append(attn_dot) attn_list.append(attn_dot)
# Mel linear projection # Mel linear projection
mel_out = self.mel_linear(query) mel_out = self.mel_linear(query)
# Post Mel Network # Post Mel Network
...@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer): ...@@ -164,7 +161,7 @@ class TransformerTTS(dg.Layer):
# key (batch_size, seq_len, channel) # key (batch_size, seq_len, channel)
# c_mask (batch_size, seq_len) # c_mask (batch_size, seq_len)
# attns_enc (channel / 2, seq_len, seq_len) # attns_enc (channel / 2, seq_len, seq_len)
key, c_mask, attns_enc = self.encoder(characters, pos_text) key, c_mask, attns_enc = self.encoder(characters, pos_text)
# mel_output/postnet_output (batch_size, mel_len, n_mel) # mel_output/postnet_output (batch_size, mel_len, n_mel)
......
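In both the Encoder and Decoder, `get_sinusoid_encoding_table` still seeds a frozen positional embedding; only the explicit parameter names are dropped. For reference, a numpy sketch of the standard sinusoidal table the embedding is presumably initialized from (matching the `[1024, num_hidden]` size, padding index zeroed):

```python
import numpy as np

def sinusoid_table(n_position, d_hid, padding_idx=0):
    pos = np.arange(n_position)[:, None].astype(np.float64)
    i = np.arange(d_hid)[None, :]
    angle = pos / np.power(10000.0, 2 * (i // 2) / d_hid)
    table = np.zeros((n_position, d_hid), dtype=np.float32)
    table[:, 0::2] = np.sin(angle[:, 0::2])   # even dims: sine
    table[:, 1::2] = np.cos(angle[:, 1::2])   # odd dims: cosine
    table[padding_idx] = 0.0                  # padding position stays all-zero
    return table

print(sinusoid_table(1024, 256).shape)  # (1024, 256)
```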
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
from scipy.io.wavfile import write from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence from parakeet.g2p.en import text_to_sequence
import numpy as np import numpy as np
from network import Model, ModelPostNet from network import TransformerTTS, ModelPostNet
from tqdm import tqdm from tqdm import tqdm
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
import paddle.fluid as fluid import paddle.fluid as fluid
...@@ -28,7 +28,7 @@ def synthesis(text_input, cfg): ...@@ -28,7 +28,7 @@ def synthesis(text_input, cfg):
writer = SummaryWriter(path) writer = SummaryWriter(path)
with dg.guard(place): with dg.guard(place):
model = Model(cfg) model = TransformerTTS(cfg)
model_postnet = ModelPostNet(cfg) model_postnet = ModelPostNet(cfg)
model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))) model.set_dict(load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")))
......
...@@ -89,8 +89,6 @@ def main(cfg): ...@@ -89,8 +89,6 @@ def main(cfg):
else: else:
loss.backward() loss.backward()
optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh)) optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg.grad_clip_thresh))
print("===============",model.pre_proj.conv.weight.numpy())
print("===============",model.pre_proj.conv.weight.gradient())
model.clear_gradients() model.clear_gradients()
if local_rank==0: if local_rank==0:
......
...@@ -63,7 +63,7 @@ def main(cfg): ...@@ -63,7 +63,7 @@ def main(cfg):
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step), optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg.warm_up_step *( cfg.lr ** 2)), cfg.warm_up_step),
parameter_list=model.parameters()) parameter_list=model.parameters())
reader = LJSpeechLoader(cfg, nranks, local_rank).reader() reader = LJSpeechLoader(cfg, nranks, local_rank, shuffle=True).reader()
if cfg.checkpoint_path is not None: if cfg.checkpoint_path is not None:
model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer")) model_dict, opti_dict = load_checkpoint(str(cfg.transformer_step), os.path.join(cfg.checkpoint_path, "transformer"))
...@@ -78,26 +78,25 @@ def main(cfg): ...@@ -78,26 +78,25 @@ def main(cfg):
for epoch in range(cfg.epochs): for epoch in range(cfg.epochs):
pbar = tqdm(reader) pbar = tqdm(reader)
for i, data in enumerate(pbar): for i, data in enumerate(pbar):
pbar.set_description('Processing at epoch %d'%epoch) pbar.set_description('Processing at epoch %d'%epoch)
character, mel, mel_input, pos_text, pos_mel, text_length = data character, mel, mel_input, pos_text, pos_mel, text_length = data
global_step += 1 global_step += 1
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
label = np.zeros(stop_preds.shape).astype(np.float32) label = np.zeros(stop_preds.shape).astype(np.float32)
text_length = text_length.numpy() text_length = text_length.numpy()
for i in range(label.shape[0]): for i in range(label.shape[0]):
label[i][text_length[i] - 1] = 1 label[i][text_length[i] - 1] = 1
mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel))) mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel))) post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
stop_loss = cross_entropy(stop_preds, dg.to_variable(label)) stop_loss = cross_entropy(stop_preds, dg.to_variable(label))
loss = mel_loss + post_mel_loss + stop_loss loss = mel_loss + post_mel_loss + stop_loss
if local_rank==0: if local_rank==0:
writer.add_scalars('training_loss', { writer.add_scalars('training_loss', {
'mel_loss':mel_loss.numpy(), 'mel_loss':mel_loss.numpy(),
......
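For reference, the stop-token labels built in the loop above place a single 1 at position `text_length[i] - 1` for each sample and zeros elsewhere; `stop_preds` is assumed here to have shape `(batch, max_len, 1)`:

```python
import numpy as np

text_length = np.array([5, 3])
stop_preds_shape = (2, 6, 1)                      # assumed (batch, max_len, 1)
label = np.zeros(stop_preds_shape).astype(np.float32)
for i in range(label.shape[0]):
    label[i][text_length[i] - 1] = 1
print(label[:, :, 0])
# [[0. 0. 0. 0. 1. 0.]
#  [0. 0. 1. 0. 0. 0.]]
```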
...@@ -5,6 +5,25 @@ import paddle ...@@ -5,6 +5,25 @@ import paddle
from paddle import fluid from paddle import fluid
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
class Linear(dg.Layer):
def __init__(self, in_features, out_features, is_bias=True, dtype="float32"):
super(Linear, self).__init__()
self.in_features = in_features
self.out_features = out_features
self.dtype = dtype
self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
self.bias = is_bias
if is_bias is not False:
k = math.sqrt(1 / in_features)
self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
self.linear = dg.Linear(in_features, out_features, param_attr = self.weight,
bias_attr = self.bias,)
def forward(self, x):
x = self.linear(x)
return x
class Conv(dg.Layer): class Conv(dg.Layer):
def __init__(self, in_channels, out_channels, filter_size=1, def __init__(self, in_channels, out_channels, filter_size=1,
......
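The new `Linear` wrapper exists mainly to pin down initialization: Xavier-uniform weights and, when a bias is used, a uniform bias in `[-k, k]` with `k = sqrt(1 / in_features)`. A numpy restatement of that scheme (the connection to aligning against a PyTorch reference implementation is an assumption):

```python
import math
import numpy as np

def init_linear(in_features, out_features, rng=np.random.default_rng(0)):
    bound_w = math.sqrt(6.0 / (in_features + out_features))  # Xavier-uniform bound
    weight = rng.uniform(-bound_w, bound_w, (in_features, out_features))
    k = math.sqrt(1.0 / in_features)                          # bias bound used above
    bias = rng.uniform(-k, k, (out_features,))
    return weight.astype(np.float32), bias.astype(np.float32)

w, b = init_linear(256, 256)
x = np.ones((4, 256), dtype=np.float32)
print((x @ w + b).shape)  # the forward pass is just x @ W + b
```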
...@@ -2,6 +2,7 @@ import math ...@@ -2,6 +2,7 @@ import math
import numpy as np import numpy as np
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from parakeet.modules.layers import Linear
class ScaledDotProductAttention(dg.Layer): class ScaledDotProductAttention(dg.Layer):
def __init__(self, d_key): def __init__(self, d_key):
...@@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer): ...@@ -34,10 +35,10 @@ class ScaledDotProductAttention(dg.Layer):
attention = attention * mask attention = attention * mask
mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1) mask = (mask == 0).astype(np.float32) * (-2 ** 32 + 1)
attention = attention + mask attention = attention + mask
attention = layers.softmax(attention) attention = layers.softmax(attention)
attention = layers.dropout(attention, dropout) attention = layers.dropout(attention, dropout)
# Mask query to ignore padding # Mask query to ignore padding
if query_mask is not None: if query_mask is not None:
attention = attention * query_mask attention = attention * query_mask
...@@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer): ...@@ -54,13 +55,13 @@ class MultiheadAttention(dg.Layer):
self.d_q = d_q self.d_q = d_q
self.dropout = dropout self.dropout = dropout
self.key = dg.Linear(num_hidden, num_head * d_k) self.key = Linear(num_hidden, num_head * d_k, is_bias=False)
self.value = dg.Linear(num_hidden, num_head * d_k) self.value = Linear(num_hidden, num_head * d_k, is_bias=False)
self.query = dg.Linear(num_hidden, num_head * d_q) self.query = Linear(num_hidden, num_head * d_q, is_bias=False)
self.scal_attn = ScaledDotProductAttention(d_k) self.scal_attn = ScaledDotProductAttention(d_k)
self.fc = dg.Linear(num_head * d_q, num_hidden) self.fc = Linear(num_head * d_q * 2, num_hidden)
self.layer_norm = dg.LayerNorm(num_hidden) self.layer_norm = dg.LayerNorm(num_hidden)
...@@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer): ...@@ -105,6 +106,7 @@ class MultiheadAttention(dg.Layer):
result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q]) result = layers.reshape(result, [self.num_head, batch_size, seq_len_query, self.d_q])
result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1]) result = layers.reshape(layers.transpose(result, [1,2,0,3]),[batch_size, seq_len_query, -1])
result = layers.concat([query_input,result], axis=-1)
result = layers.dropout(self.fc(result), self.dropout) result = layers.dropout(self.fc(result), self.dropout)
result = result + query_input result = result + query_input
......
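The attention output path also changes: the merged multi-head result is concatenated with the original query input before the final projection (hence the `num_head * d_q * 2` input size of `fc`), and the residual connection is kept on top. A numpy shape sketch (dropout omitted):

```python
import numpy as np

batch, seq_q, num_hidden = 2, 10, 256
num_head = 4
d_q = num_hidden // num_head                     # 64, so num_head * d_q == num_hidden

query_input = np.random.randn(batch, seq_q, num_hidden).astype(np.float32)
result = np.random.randn(batch, seq_q, num_head * d_q).astype(np.float32)  # merged heads

concat = np.concatenate([query_input, result], axis=-1)   # (B, T, num_head * d_q * 2)
w_fc = np.zeros((num_head * d_q * 2, num_hidden), dtype=np.float32)
out = concat @ w_fc + query_input                          # projection + residual
assert out.shape == (batch, seq_q, num_hidden)
```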
...@@ -16,6 +16,7 @@ class PostConvNet(dg.Layer): ...@@ -16,6 +16,7 @@ class PostConvNet(dg.Layer):
super(PostConvNet, self).__init__() super(PostConvNet, self).__init__()
self.dropout = dropout self.dropout = dropout
self.num_conv = num_conv
self.conv_list = [] self.conv_list = []
self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step, self.conv_list.append(Conv(in_channels = n_mels * outputs_per_step,
out_channels = num_hidden, out_channels = num_hidden,
...@@ -43,17 +44,9 @@ class PostConvNet(dg.Layer): ...@@ -43,17 +44,9 @@ class PostConvNet(dg.Layer):
self.add_sublayer("conv_list_{}".format(i), layer) self.add_sublayer("conv_list_{}".format(i), layer)
self.batch_norm_list = [dg.BatchNorm(num_hidden, self.batch_norm_list = [dg.BatchNorm(num_hidden,
param_attr = fluid.ParamAttr(name='weight'),
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW') for _ in range(num_conv-1)] data_layout='NCHW') for _ in range(num_conv-1)]
self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step, #self.batch_norm_list.append(dg.BatchNorm(n_mels * outputs_per_step,
param_attr = fluid.ParamAttr(name='weight'), # data_layout='NCHW'))
bias_attr = fluid.ParamAttr(name='bias'),
moving_mean_name = 'moving_mean',
moving_variance_name = 'moving_var',
data_layout='NCHW'))
for i, layer in enumerate(self.batch_norm_list): for i, layer in enumerate(self.batch_norm_list):
self.add_sublayer("batch_norm_list_{}".format(i), layer) self.add_sublayer("batch_norm_list_{}".format(i), layer)
...@@ -67,9 +60,15 @@ class PostConvNet(dg.Layer): ...@@ -67,9 +60,15 @@ class PostConvNet(dg.Layer):
Returns: Returns:
output (Variable), Shape(B, T, C), the result after postconvnet. output (Variable), Shape(B, T, C), the result after postconvnet.
""" """
input = layers.transpose(input, [0,2,1]) input = layers.transpose(input, [0,2,1])
len = input.shape[-1] len = input.shape[-1]
for batch_norm, conv in zip(self.batch_norm_list, self.conv_list): for i in range(self.num_conv-1):
batch_norm = self.batch_norm_list[i]
conv = self.conv_list[i]
input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout) input = layers.dropout(layers.tanh(batch_norm(conv(input)[:,:,:len])), self.dropout)
conv = self.conv_list[self.num_conv-1]
input = conv(input)[:,:,:len]
output = layers.transpose(input, [0,2,1]) output = layers.transpose(input, [0,2,1])
return output return output
\ No newline at end of file
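After this edit, the forward pass applies batch norm, tanh, and dropout only to the first `num_conv - 1` convolutions; the final convolution stays linear, which is why the last `BatchNorm` is commented out above. A compact restatement with identity stand-ins just to exercise the control flow:

```python
import numpy as np

def postconvnet_forward(x, conv_list, batch_norm_list, dropout, tanh):
    length = x.shape[-1]
    for conv, bn in zip(conv_list[:-1], batch_norm_list):   # first num_conv - 1 layers
        x = dropout(tanh(bn(conv(x)[:, :, :length])))
    return conv_list[-1](x)[:, :, :length]                  # last conv: no BN/activation

identity = lambda t: t
x = np.zeros((2, 80, 100), dtype=np.float32)                # (B, n_mels, T)
out = postconvnet_forward(x, [identity] * 5, [identity] * 4, identity, np.tanh)
print(out.shape)  # (2, 80, 100)
```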
import paddle.fluid.dygraph as dg import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from parakeet.modules.layers import Linear
class PreNet(dg.Layer): class PreNet(dg.Layer):
def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2): def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
...@@ -14,8 +15,8 @@ class PreNet(dg.Layer): ...@@ -14,8 +15,8 @@ class PreNet(dg.Layer):
self.output_size = output_size self.output_size = output_size
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.linear1 = dg.Linear(input_size, hidden_size) self.linear1 = Linear(input_size, hidden_size)
self.linear2 = dg.Linear(hidden_size, output_size) self.linear2 = Linear(hidden_size, output_size)
def forward(self, x): def forward(self, x):
""" """
......