# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from itertools import chain
from collections import namedtuple

from paddle import fluid
import paddle.fluid.dygraph as dg
import numpy as np

from parakeet.modules import conv
from parakeet.modules.modules import Embedding, PositionEmbedding
from parakeet.modules.modules import FC, Conv1D, Conv1DGLU, Conv1DTranspose

ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
WindowRange = namedtuple("WindowRange", ["backward", "ahead"])


def expand_speaker_embed(x, speaker_embed, tdim=-1):
    """
    Expand speaker embeddings for multiple timesteps.

    Args:
        x (Variable): A reference Variable used to determine the number of
            timesteps.
        speaker_embed (Variable): Shape(B, C), speaker embeddings, where B
            means batch_size and C means the speaker embedding size.
        tdim (int, optional): The index of the time dimension in x. Defaults
            to -1, which means the last dimension is the time dimension.

    Returns:
        Variable: Shape(B, C, 1, T), the expanded speaker embeddings, where
            T = x.shape[tdim] is the number of timesteps.
    """
    speaker_embed = fluid.layers.reshape(
        speaker_embed, shape=speaker_embed.shape + [1, 1])
    time_steps = x.shape[tdim]
    speaker_embed_bc1t = fluid.layers.expand(
        speaker_embed, expand_times=[1, 1, 1, time_steps])
    return speaker_embed_bc1t


def gen_mask2(valid_lengths, max_len, dtype="float32"):
    """
    Generate a mask tensor from valid lengths. Note that it returns a
    *reversed* mask: indices within the valid lengths correspond to 0, and
    those within the padding area correspond to 1.

    Assume that valid_lengths = [2, 5, 7] and max_len = 7; the generated
    mask is
    [[0, 0, 1, 1, 1, 1, 1],
     [0, 0, 0, 0, 0, 1, 1],
     [0, 0, 0, 0, 0, 0, 0]].

    Args:
        valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor
            containing the valid lengths (timesteps) of each example, where
            B means batch_size.
        max_len (int): The length (number of timesteps) of the mask.
        dtype (str, optional): A string that specifies the data type of the
            returned mask.

    Returns:
        mask (Variable): A mask computed from valid lengths.
    """
    batch_size = valid_lengths.shape[0]
    mask = fluid.layers.sequence_mask(
        valid_lengths, maxlen=max_len, dtype=dtype)
    mask = 1 - mask
    return mask


def expand_mask(mask, attn):
    """
    Expand a mask for multiple time steps. This function is used by the
    AttentionLayer in the Decoder to expand a mask for every timestep in
    the decoder.

    Args:
        mask (Variable): Shape(B, T_enc), a mask generated with valid text
            lengths, where T_enc means the encoder length (time steps).
        attn (Variable): Shape(B, T_dec, T_enc), the alignment tensor
            between the encoder and the decoder, where T_dec means the
            decoder length (time steps).

    Returns:
        mask_btc (Variable): Shape(B, T_dec, T_enc), the expanded mask.
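
    Example:
        Shapes are illustrative: if mask has shape (2, 7) and attn has
        shape (2, 3, 7) (3 decoder timesteps), the returned mask_btc has
        shape (2, 3, 7), i.e. the same per-text mask repeated for every
        decoder timestep.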
""" decoder_length = attn.shape[1] mask = fluid.layers.reshape(mask, [mask.shape[0], 1, mask.shape[1]]) mask_btc = fluid.layers.expand(mask, expand_times=[1, decoder_length, 1]) return mask_btc class Encoder(dg.Layer): def __init__(self, name_scope, n_vocab, embed_dim, n_speakers, speaker_dim, padding_idx=None, embedding_weight_std=0.1, convolutions=(ConvSpec(64, 5, 1)) * 7, max_positions=512, dropout=0.1, dtype="float32"): super(Encoder, self).__init__(name_scope, dtype=dtype) self.dropout = dropout self.embedding_weight_std = embedding_weight_std self.embed = Embedding( self.full_name(), n_vocab, embed_dim, padding_idx=padding_idx, std=embedding_weight_std, dtype=dtype) if n_speakers > 1: self.sp_proj1 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.sp_proj2 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.n_speakers = n_speakers self.convolutions = [] in_channels = embed_dim std_mul = 1.0 for (out_channels, filter_size, dilation) in convolutions: # 1 * 1 convolution & relu if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size, dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 self.convolutions.append( Conv1D( self.full_name(), in_channels, embed_dim, filter_size=1, std_mul=std_mul, dropout=dropout, dtype=dtype)) for i, layer in enumerate(self.convolutions): self.add_sublayer("convolution_{}".format(i), layer) def forward(self, x, speaker_embed=None): """ Encode text sequence. Args: x (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of decoder input x. speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim), dtype: float32. Speaker embeddings. This arg is not None only when the model is a multispeaker model. Returns: keys (Variable), Shape(B, C_emb, 1, T_enc), the encoded representation for keys, where C_emb menas the text embedding size. values (Variable), Shape(B, C_embed, 1, T_enc), the encoded representation for values. 
""" x = self.embed(x) x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = fluid.layers.transpose( fluid.layers.reshape( x, shape=x.shape + [1]), perm=[0, 2, 3, 1]) speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = x + self.sp_proj1(speaker_embed_bc1t) input_embed = x for layer in self.convolutions: if isinstance(layer, Conv1DGLU): x = layer(x, speaker_embed_bc1t) else: x = layer(x) if speaker_embed is not None: x = x + self.sp_proj2(speaker_embed_bc1t) keys = x values = fluid.layers.scale(input_embed + x, scale=np.sqrt(0.5)) return keys, values def freeze_embedding(self): """Fix text embedding while training.""" for param in self.embed.parameters(): param.trainable = False class AttentionLayer(dg.Layer): def __init__(self, name_scope, conv_channels, embed_dim, dropout=0.0, window_range=WindowRange(-1, 3), key_projection=True, value_projection=True, dtype="float32"): super(AttentionLayer, self).__init__(name_scope, dtype=dtype) self.query_proj = Conv1D( self.full_name(), conv_channels, embed_dim, filter_size=1, dtype=dtype) if key_projection: self.key_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) if value_projection: self.value_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) self.out_proj = Conv1D( self.full_name(), embed_dim, conv_channels, filter_size=1, dtype=dtype) self.key_projection = key_projection self.value_projection = value_projection self.dropout = dropout self.window_range = window_range def forward(self, query, encoder_out, mask=None, last_attended=None): """ Compute pooled context representation and alignment scores. Args: query (Variable): shape(B, C_q, 1, T_dec), the query tensor, where C_q means the channel of query. encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. mask (Variable, optional): Shape(B, T_enc), mask generated with valid text lengths. last_attended (int, optional): The position that received most attention at last timestep. This is only used at decoding. Outpus: x (Variable): Shape(B, C_q, 1, T_dec), the context representation pooled from attention mechanism. attn_scores (Variable): shape(B, T_dec, T_enc), the alignment tensor, where T_dec means the number of decoder time steps and T_enc means number the number of decoder time steps. """ keys, values = encoder_out residual = query if self.value_projection: values = self.value_proj(values) if self.key_projection: keys = self.key_proj(keys) x = self.query_proj(query) batch_size, conv_channels, _, decoder_length = query.shape encoder_length = keys.shape[-1] embed_dim = keys.shape[1] x = fluid.layers.matmul( fluid.layers.reshape( x, shape=[batch_size, embed_dim, decoder_length]), fluid.layers.reshape( keys, shape=[batch_size, embed_dim, encoder_length]), transpose_x=True) mask_value = -1.0e30 if mask is not None: mask = expand_mask(mask, x) neg_inf_mask = fluid.layers.scale(mask, mask_value) x = x + neg_inf_mask # if last_attended is provided, focus only on a window range around it # to enforce monotonic attention. 
        if last_attended is not None:
            locality_mask = np.ones(shape=x.shape, dtype=np.float32)
            backward, ahead = self.window_range
            backward = last_attended + backward
            ahead = last_attended + ahead
            if backward < 0:
                backward = 0
            if ahead > x.shape[-1]:
                ahead = x.shape[-1]
            locality_mask[:, :, backward:ahead] = 0.

            locality_mask = dg.to_variable(locality_mask)
            neg_inf_mask = fluid.layers.scale(locality_mask, mask_value)
            x = x + neg_inf_mask

        x = fluid.layers.softmax(x)
        attn_scores = x

        x = fluid.layers.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = fluid.layers.matmul(
            fluid.layers.reshape(
                values, shape=[batch_size, embed_dim, encoder_length]),
            x,
            transpose_y=True)
        x = fluid.layers.reshape(x, [batch_size, embed_dim, 1, decoder_length])
        x = fluid.layers.scale(x,
                               encoder_length * np.sqrt(1.0 / encoder_length))

        x = self.out_proj(x)
        x = fluid.layers.scale((x + residual), np.sqrt(0.5))
        return x, attn_scores


class Decoder(dg.Layer):
    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 embed_dim,
                 mel_dim=80,
                 r=5,
                 max_positions=512,
                 padding_idx=None,
                 preattention=(ConvSpec(128, 5, 1), ) * 4,
                 convolutions=(ConvSpec(128, 5, 1), ) * 4,
                 attention=True,
                 dropout=0.1,
                 use_memory_mask=False,
                 force_monotonic_attention=False,
                 query_position_rate=1.0,
                 key_position_rate=1.29,
                 window_range=WindowRange(-1, 3),
                 key_projection=True,
                 value_projection=True,
                 dtype="float32"):
        super(Decoder, self).__init__(name_scope, dtype=dtype)

        self.dropout = dropout
        self.mel_dim = mel_dim
        self.r = r
        self.query_position_rate = query_position_rate
        self.key_position_rate = key_position_rate
        self.window_range = window_range
        self.n_speakers = n_speakers

        conv_channels = convolutions[0].out_channels
        self.embed_query_positions = PositionEmbedding(
            self.full_name(),
            max_positions,
            conv_channels,
            padding_idx=padding_idx,
            dtype=dtype)
        self.embed_keys_positions = PositionEmbedding(
            self.full_name(),
            max_positions,
            embed_dim,
            padding_idx=padding_idx,
            dtype=dtype)

        # Used to compute multiplier for position rate
        if n_speakers > 1:
            self.speaker_proj1 = FC(self.full_name(),
                                    speaker_dim,
                                    1,
                                    act="sigmoid",
                                    dropout=dropout,
                                    dtype=dtype)
            self.speaker_proj2 = FC(self.full_name(),
                                    speaker_dim,
                                    1,
                                    act="sigmoid",
                                    dropout=dropout,
                                    dtype=dtype)

        # prenet
        self.prenet = []
        in_channels = mel_dim * r
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in preattention:
            if in_channels != out_channels:
                # conv1d & relu
                self.prenet.append(
                    Conv1D(
                        self.full_name(),
                        in_channels,
                        out_channels,
                        filter_size=1,
                        std_mul=std_mul,
                        act="relu"))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul=std_mul,
                    dropout=dropout,
                    causal=True,
                    residual=True,
                    dtype=dtype))
            in_channels = out_channels
            std_mul = 4.0
        for i, layer in enumerate(self.prenet):
            self.add_sublayer("prenet_{}".format(i), layer)

        self.use_memory_mask = use_memory_mask
        if isinstance(attention, bool):
            self.attention = [attention] * len(convolutions)
        else:
            self.attention = attention

        if isinstance(force_monotonic_attention, bool):
            self.force_monotonic_attention = [force_monotonic_attention
                                              ] * len(convolutions)
        else:
            self.force_monotonic_attention = force_monotonic_attention

        # causal convolution & attention
        self.conv_attn = []
        for use_attention, (out_channels, filter_size,
                            dilation) in zip(self.attention, convolutions):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(
                self.full_name(),
                n_speakers,
                speaker_dim,
                in_channels,
                out_channels,
                filter_size,
                dilation,
                std_mul=std_mul,
                dropout=dropout,
                causal=True,
                residual=False,
                dtype=dtype)
            attn_layer = (AttentionLayer(
                self.full_name(),
                out_channels,
                embed_dim,
                dropout=dropout,
                window_range=window_range,
                key_projection=key_projection,
                value_projection=value_projection,
                dtype=dtype) if use_attention else None)
            in_channels = out_channels
            std_mul = 4.0
            self.conv_attn.append((conv_layer, attn_layer))
        for i, (conv_layer, attn_layer) in enumerate(self.conv_attn):
            self.add_sublayer("conv_{}".format(i), conv_layer)
            if attn_layer is not None:
                self.add_sublayer("attn_{}".format(i), attn_layer)

        # 1 * 1 conv to transform channels
        self.last_conv = Conv1D(
            self.full_name(),
            in_channels,
            mel_dim * r,
            filter_size=1,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)

        # mel (before sigmoid) to done hat
        self.fc = Conv1D(
            self.full_name(), mel_dim * r, 1, filter_size=1, dtype=dtype)

        # decoding configs
        self.max_decoder_steps = 200
        self.min_decoder_steps = 10

    def freeze_positional_encoding(self):
        for param in self.embed_query_positions.parameters():
            param.trainable = False
        for param in self.embed_keys_positions.parameters():
            param.trainable = False

    def forward(self, encoder_out, lengths, inputs, text_positions,
                frame_positions, speaker_embed=None):
        """
        Compute decoder outputs with ground truth mel spectrogram.

        Args:
            encoder_out (Tuple(Variable, Variable)):
                keys (Variable): Shape(B, C_emb, 1, T_enc), the key
                    representation from an encoder, where C_emb means the
                    text embedding size.
                values (Variable): Shape(B, C_emb, 1, T_enc), the value
                    representation from an encoder, where C_emb means the
                    text embedding size.
            lengths (Variable): Shape(B,), dtype: int64, valid lengths of
                text inputs for each example.
            inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                mel-spectrogram, which is used as decoder inputs when
                training.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                speaker embedding, only used for the multispeaker model.

        Returns:
            outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder
                outputs, where C_mel means the channels of mel-spectrogram,
                r means the outputs per decoder step, T_mel means the
                length (time steps) of mel spectrogram. Note that, when
                r > 1, the decoder outputs r frames of mel spectrogram per
                step.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
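
        Example:
            A worked shape example (numbers are illustrative): with
            mel_dim=80 and r=4, ground truth inputs of shape
            (B, 80, 1, 400) are packed into (B, 320, 1, 100) before the
            prenet, so the decoder runs for 100 steps, outputs has shape
            (B, 320, 1, 100) and done has shape (B, 1, 1, 100).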
""" # pack multiple frames if necessary B, _, _, T = inputs.shape if self.r > 1 and inputs.shape[1] == self.mel_dim: if T % self.r != 0: inputs = fluid.layers.slice( inputs, axes=[3], starts=[0], ends=[T - T % self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) inputs = fluid.layers.reshape( inputs, shape=[B, -1, 1, self.mel_dim * self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) assert inputs.shape[3] == T // self.r if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(inputs, speaker_embed) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") else: speaker_embed_bc1t = None keys, values = encoder_out if self.use_memory_mask and lengths is not None: mask = gen_mask2(lengths, keys.shape[-1]) else: mask = None if text_positions is not None: w = self.key_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj1(speaker_embed), [B, -1]) text_pos_embed = self.embed_keys_positions(text_positions, w) text_pos_embed = fluid.layers.transpose( fluid.layers.reshape( text_pos_embed, shape=text_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) keys = keys + text_pos_embed if frame_positions is not None: w = self.query_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj2(speaker_embed), [B, -1]) frame_pos_embed = self.embed_query_positions(frame_positions, w) frame_pos_embed = fluid.layers.transpose( fluid.layers.reshape( frame_pos_embed, shape=frame_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) else: frame_pos_embed = None x = inputs x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) # Convolution & Multi-hop Attention alignments = [] for conv, attn in self.conv_attn: residual = x x = conv(x, speaker_embed_bc1t) if attn is not None: if frame_pos_embed is not None: x = x + frame_pos_embed x, attn_scores = attn(x, (keys, values), mask) alignments.append(attn_scores) x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5)) alignments = fluid.layers.stack(alignments) decoder_states = x x = self.last_conv(x) outputs = fluid.layers.sigmoid(x) done = fluid.layers.sigmoid(self.fc(x)) return outputs, alignments, done, decoder_states def decode(self, encoder_out, text_positions, speaker_embed=None, initial_input=None, test_inputs=None): """ Decode without ground truth mel spectrogram. Args: encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. speaker_embed (Variable): Shape(B, C_sp), where C_sp means speaker embedding size. It is only used for multispeaker model. initial_input (Variable, optional): Shape(B, C_mel * r, 1, 1). The input for the first time step of the decoder. If r > 0, it is a packed r frames of mel spectrograms. test_inputs (Variable, optional): Shape(B, C_mel, 1, T_test), where T_test means the time steps of test inputs. This is only used for testing this method, the user should just leave it None. Returns: outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). 
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of the output mel
                spectrogram. Note that, when r > 1, the decoder outputs r
                frames of mel spectrogram per step.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of the output mel spectrogram, r means the
                outputs per decoder step, T_enc means the encoder time
                steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        self.start_new_sequence()
        keys, values = encoder_out
        B = keys.shape[0]
        assert B == 1, "now only supports single instance inference"
        mask = None  # no mask because we use single instance decoding

        w = self.key_position_rate
        if speaker_embed is not None:
            if self.n_speakers > 1:
                w = w * fluid.layers.reshape(
                    self.speaker_proj1(speaker_embed), shape=[B, -1])
            speaker_embed_bc11 = fluid.layers.reshape(
                speaker_embed, shape=[B, speaker_embed.shape[1], 1, 1])
        else:
            speaker_embed_bc11 = None

        if text_positions is not None:
            text_pos_embed = self.embed_keys_positions(text_positions, w)
            text_pos_embed = fluid.layers.transpose(
                fluid.layers.reshape(
                    text_pos_embed, shape=text_pos_embed.shape + [1]),
                perm=[0, 2, 3, 1])
            keys = keys + text_pos_embed

        # start decoding, init accumulators
        decoder_states = []
        outputs = []
        alignments = []
        dones = []

        last_attended = [None] * len(self.conv_attn)
        for idx, monotonic_attn in enumerate(self.force_monotonic_attention):
            if monotonic_attn:
                last_attended[idx] = 0

        t = 0  # decoder time step
        if initial_input is None:
            initial_input = fluid.layers.zeros(
                shape=[B, self.mel_dim * self.r, 1, 1], dtype=keys.dtype)
        current_input = initial_input
        while True:
            frame_pos = fluid.layers.fill_constant(
                shape=[B, 1, 1], value=t + 1, dtype="int64")
            w = self.query_position_rate
            if self.n_speakers > 1:
                w = w * fluid.layers.reshape(
                    self.speaker_proj2(speaker_embed), shape=[B, -1])
            frame_pos_embed = self.embed_query_positions(frame_pos, w)
            frame_pos_embed = fluid.layers.transpose(
                fluid.layers.reshape(
                    frame_pos_embed, shape=frame_pos_embed.shape + [1]),
                perm=[0, 2, 3, 1])

            if test_inputs is not None:
                if t >= test_inputs.shape[3]:
                    break
                current_input = fluid.layers.reshape(
                    test_inputs[:, :, :, t],
                    shape=[B, test_inputs.shape[1], 1, 1])
            else:
                if t > 0:
                    current_input = outputs[-1]

            x = current_input
            x = fluid.layers.dropout(
                x, self.dropout, dropout_implementation="upscale_in_train")

            # Prenet
            for layer in self.prenet:
                x = (layer.add_input(x, speaker_embed_bc11)
                     if isinstance(layer, Conv1DGLU) else layer.add_input(x))

            step_attn_scores = []
            # Causal convolutions + Multi-hop attentions
            for i, (conv, attn) in enumerate(self.conv_attn):
                residual = x
                x = conv.add_input(x, speaker_embed_bc11)
                if attn is not None:
                    if frame_pos_embed is not None:
                        x = x + frame_pos_embed
                    x, attn_scores = attn(x, (keys, values), mask,
                                          last_attended[i])
                    step_attn_scores.append(attn_scores)
                    # update last attended when necessary
                    if self.force_monotonic_attention[i]:
                        last_attended[i] = np.argmax(
                            attn_scores.numpy(), axis=-1)[0][0]
                x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5))
            if len(step_attn_scores):
                average_attn_scores = fluid.layers.reduce_mean(
                    fluid.layers.stack(step_attn_scores), dim=0)
            else:
                average_attn_scores = None

            decoder_state = x
            x = self.last_conv.add_input(x)

            output = fluid.layers.sigmoid(x)  # (B, r * C_mel, 1, 1)
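
            # "done" below is a per-step stop probability predicted from the
            # same pre-sigmoid features; decoding stops once its minimum
            # exceeds 0.5 (after min_decoder_steps) or when max_decoder_steps
            # is reached.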
            done = fluid.layers.sigmoid(self.fc(x))  # (B, 1, 1, 1)

            decoder_states.append(decoder_state)
            outputs.append(output)
            if average_attn_scores is not None:
                alignments.append(average_attn_scores)
            dones.append(done)

            t += 1

            if test_inputs is None:
                if (fluid.layers.reduce_min(done).numpy()[0] > 0.5 and
                        t > self.min_decoder_steps):
                    break
                elif t > self.max_decoder_steps:
                    break

        outputs = fluid.layers.concat(outputs, axis=3)
        if len(alignments):
            alignments = fluid.layers.concat(alignments, axis=1)
        else:
            alignments = None
        dones = fluid.layers.concat(dones, axis=3)
        decoder_states = fluid.layers.concat(decoder_states, axis=3)

        return outputs, alignments, dones, decoder_states

    def start_new_sequence(self):
        for layer in self.sublayers():
            if isinstance(layer, conv.Conv1D):
                layer.start_new_sequence()


class Converter(dg.Layer):
    """
    Vocoder-like postnet that transforms mel spectrogram (or decoder hidden
    states) to linear spectrogram.
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 linear_dim,
                 convolutions=(ConvSpec(256, 5, 1), ) * 4,
                 time_upsampling=1,
                 dropout=0.1,
                 dtype="float32"):
        super(Converter, self).__init__(name_scope, dtype=dtype)

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.in_channels = in_channels
        self.linear_dim = linear_dim
        self.time_upsampling = time_upsampling
        self.dropout = dropout

        target_channels = convolutions[0][0]

        # conv proj to target channels
        self.first_conv_proj = Conv1D(
            self.full_name(),
            in_channels,
            target_channels,
            filter_size=1,
            std_mul=1.0,
            dtype=dtype)

        # Idea from nyanko
        # upsampling convolutions
        if time_upsampling == 4:
            self.upsampling_convolutions = [
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=1.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=4.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
            ]
        elif time_upsampling == 2:
            self.upsampling_convolutions = [
                Conv1DTranspose(
                    self.full_name(),
                    target_channels,
                    target_channels,
                    filter_size=2,
                    padding=0,
                    stride=2,
                    std_mul=1.0,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=1,
                    std_mul=1.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype),
            ]
        elif time_upsampling == 1:
            self.upsampling_convolutions = [
                Conv1DGLU(
                    self.full_name(),
                    n_speakers,
                    speaker_dim,
                    target_channels,
                    target_channels,
                    filter_size=3,
                    dilation=3,
                    std_mul=4.0,
                    dropout=dropout,
                    causal=False,
                    residual=True,
                    dtype=dtype)
            ]
        else:
            raise ValueError("Not supported.")

        for i, layer in enumerate(self.upsampling_convolutions):
self.add_sublayer("upsampling_convolutions_{}".format(i), layer) # post conv layers std_mul = 4.0 in_channels = target_channels self.convolutions = [] for (out_channels, filter_size, dilation) in convolutions: if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size=filter_size, dilation=dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 for i, layer in enumerate(self.convolutions): self.add_sublayer("convolutions_{}".format(i), layer) # final conv proj, channel transformed to linear dim self.last_conv_proj = Conv1D( self.full_name(), in_channels, linear_dim, filter_size=1, std_mul=std_mul, dropout=dropout, act="sigmoid", dtype=dtype) def forward(self, x, speaker_embed=None): """ Convert mel spectrogram or decoder hidden states to linear spectrogram. Args: x (Variable): Shape(B, C_in, 1, T_mel), converter inputs, where C_in means the input channel for the converter. Note that it can be either C_mel (channel of mel spectrogram) or C_dec // r. When use mel_spectrogram as the input of converter, C_in = C_mel; and when use decoder states as the input of converter, C_in = C_dec // r. In this scenario, decoder hidden states are treated as if they were r outputs per decoder step and are unpacked before passing to the converter. speaker_embed (Variable, optional): shape(B, C_sp), speaker embedding, where C_sp means the speaker embedding size. Returns: out (Variable): Shape(B, C_lin, 1, T_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling converter. 
""" speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=-1) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = self.first_conv_proj(x) for layer in chain(self.upsampling_convolutions, self.convolutions): # time_steps may change when timt_upsampling > 1 if (speaker_embed_bc1t is not None and speaker_embed_bc1t.shape[3] != x.shape[3]): speaker_embed_bc1t = expand_speaker_embed( x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) out = self.last_conv_proj(x) return out class DeepVoiceTTS(dg.Layer): def __init__(self, name_scope, n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, linear_dim, postnet_convolutions, time_upsampling, dropout, use_decoder_state_for_postnet_input, dtype): super(DeepVoiceTTS, self).__init__(name_scope, dtype) self.n_speakers = n_speakers self.speaker_dim = speaker_dim if n_speakers > 1: self.speaker_embedding = Embedding( self.full_name(), n_speakers, speaker_dim, padding_idx=None, std=speaker_embedding_weight_std, dtype=dtype) self.embed_dim = embed_dim self.mel_dim = mel_dim self.r = r self.seq2seq = ConvS2S( self.full_name(), n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, dropout, dtype) self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input if use_decoder_state_for_postnet_input: assert ( attentive_convolutions[-1].out_channels % self.r == 0 ), "when using decoder states as converter input, you must assure the decoder state channels can be divided by r" converter_input_channels = attentive_convolutions[ -1].out_channels // r else: converter_input_channels = mel_dim self.converter_input_channels = converter_input_channels self.linear_dim = linear_dim self.converter = Converter( self.full_name(), n_speakers, speaker_dim, converter_input_channels, linear_dim, convolutions=postnet_convolutions, time_upsampling=time_upsampling, dropout=dropout, dtype=dtype) def forward(self, text_sequences, valid_lengths, mel_inputs, speaker_indices=None, text_positions=None, frame_positions=None): """ Encode text sequence and decode with ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. valid_lengths (Variable): shape(batch_size,), dtype: int64, valid lengths for each example in text_sequences. mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. 
            speaker_indices (Variable, optional): Shape(B, 1), dtype:
                int64. Speaker index for each example. This arg is not None
                only when the model is a multispeaker model.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the
                output linear spectrogram, where C_lin means the channel of
                linear spectrogram and T_lin means the length (time steps)
                of the linear spectrogram, T_lin = time_upsampling * T_mel,
                which depends on the converter's time_upsampling.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
        """
        batch_size = text_sequences.shape[0]
        if self.n_speakers == 1:
            assert speaker_indices is None, "this model does not support multi-speaker"

        if speaker_indices is not None:
            speaker_embed = self.speaker_embedding(speaker_indices)
        else:
            speaker_embed = None

        mel_outputs, alignments, done, decoder_states = self.seq2seq(
            text_sequences, valid_lengths, mel_inputs, speaker_embed,
            text_positions, frame_positions)

        # unpack multi frames
        if self.r > 1:
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
            mel_outputs = fluid.layers.reshape(
                mel_outputs, [batch_size, -1, 1, self.mel_dim])
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])

        if self.use_decoder_state_for_postnet_input:
            postnet_input = fluid.layers.transpose(decoder_states,
                                                   [0, 3, 2, 1])
            postnet_input = fluid.layers.reshape(
                postnet_input,
                [batch_size, -1, 1, self.converter_input_channels])
            postnet_input = fluid.layers.transpose(postnet_input,
                                                   [0, 3, 2, 1])
        else:
            postnet_input = mel_outputs

        linear_outputs = self.converter(postnet_input, speaker_embed)

        return mel_outputs, linear_outputs, alignments, done

    def transduce(self, text_sequences, text_positions, speaker_indices=None):
        """
        Encode text sequence and decode without ground truth mel
        spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            speaker_indices (Variable, optional): Shape(B, 1), dtype:
                int64. Speaker index for each example. This arg is not None
                only when the model is a multispeaker model.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the
                output linear spectrogram, where C_lin means the channel of
                linear spectrogram and T_lin means the length (time steps)
                of the linear spectrogram, T_lin = time_upsampling * T_mel,
                which depends on the converter's time_upsampling.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of mel spectrogram, r means the outputs
                per decoder step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
        """
        batch_size = text_sequences.shape[0]
        if speaker_indices is not None:
            speaker_embed = self.speaker_embedding(speaker_indices)
        else:
            speaker_embed = None

        mel_outputs, alignments, done, decoder_states = self.seq2seq.transduce(
            text_sequences, text_positions, speaker_embed)

        if self.r > 1:
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])
            mel_outputs = fluid.layers.reshape(
                mel_outputs, [batch_size, -1, 1, self.mel_dim])
            mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1])

        if self.use_decoder_state_for_postnet_input:
            postnet_input = fluid.layers.transpose(decoder_states,
                                                   [0, 3, 2, 1])
            postnet_input = fluid.layers.reshape(
                postnet_input,
                [batch_size, -1, 1, self.converter_input_channels])
            postnet_input = fluid.layers.transpose(postnet_input,
                                                   [0, 3, 2, 1])
        else:
            postnet_input = mel_outputs

        linear_outputs = self.converter(postnet_input, speaker_embed)

        return mel_outputs, linear_outputs, alignments, done


class ConvS2S(dg.Layer):
    def __init__(self, name_scope, n_speakers, speaker_dim,
                 speaker_embedding_weight_std, n_vocab, embed_dim,
                 text_padding_idx, text_embedding_weight_std,
                 freeze_text_embedding, encoder_convolutions, max_positions,
                 position_padding_idx, trainable_positional_encodings,
                 mel_dim, r, prenet_convolutions, attentive_convolutions,
                 attention, use_memory_mask, force_monotonic_attention,
                 query_position_rate, key_position_rate, window_range,
                 key_projection, value_projection, dropout, dtype):
        super(ConvS2S, self).__init__(name_scope, dtype)

        self.freeze_text_embedding = freeze_text_embedding
        self.trainable_positional_encodings = trainable_positional_encodings

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim

        self.embed_dim = embed_dim
        self.encoder = Encoder(
            self.full_name(),
            n_vocab,
            embed_dim,
            n_speakers,
            speaker_dim,
            padding_idx=None,
            embedding_weight_std=text_embedding_weight_std,
            convolutions=encoder_convolutions,
            max_positions=max_positions,
            dropout=dropout,
            dtype=dtype)
        if freeze_text_embedding:
            self.encoder.freeze_embedding()

        self.mel_dim = mel_dim
        self.r = r
        self.decoder = Decoder(
            self.full_name(),
            n_speakers,
            speaker_dim,
            embed_dim,
            mel_dim,
            r,
            max_positions,
            position_padding_idx,
            preattention=prenet_convolutions,
            convolutions=attentive_convolutions,
            attention=attention,
            dropout=dropout,
            use_memory_mask=use_memory_mask,
            force_monotonic_attention=force_monotonic_attention,
            query_position_rate=query_position_rate,
            key_position_rate=key_position_rate,
            window_range=window_range,
            key_projection=key_projection,
            value_projection=value_projection,
            dtype=dtype)
        if not trainable_positional_encodings:
            self.decoder.freeze_positional_encoding()

    def forward(self,
                text_sequences,
                valid_lengths,
                mel_inputs,
                speaker_embed=None,
                text_positions=None,
                frame_positions=None):
        """
        Encode text sequence and decode with ground truth mel spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            valid_lengths (Variable): Shape(B,), dtype: int64, valid
                lengths for each example in text_sequences.
            mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth
                mel-spectrogram, which is used as decoder inputs when
                training.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                dtype: float32. Speaker embeddings. This arg is not None
                only when the model is a multispeaker model.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            frame_positions (Variable): Shape(B, T_dec // r, 1), dtype:
                int64. Position indices for each decoder time step.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            alignments (Variable): Shape(N, B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, where
                N means the number of AttentionLayers, T_mel means the
                length of mel spectrogram, r means the outputs per decoder
                step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder(
            (keys, values), valid_lengths, mel_inputs, text_positions,
            frame_positions, speaker_embed)
        return mel_outputs, alignments, done, decoder_states

    def transduce(self, text_sequences, text_positions, speaker_embed=None):
        """
        Encode text sequence and decode without ground truth mel
        spectrogram.

        Args:
            text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64.
                The input text indices. T_enc means the timesteps of
                text_sequences.
            text_positions (Variable): Shape(B, T_enc, 1), dtype: int64.
                Position indices for text inputs for the encoder, where
                T_enc means the encoder timesteps.
            speaker_embed (Variable, optional): Shape(B, speaker_dim),
                dtype: float32. Speaker embeddings. This arg is not None
                only when the model is a multispeaker model.

        Returns:
            mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r).
                Decoder outputs, where C_mel means the channels of
                mel-spectrogram, r means the outputs per decoder step,
                T_mel means the length (time steps) of mel spectrogram.
                Note that, when r > 1, the decoder outputs r frames of mel
                spectrogram per step.
            alignments (Variable): Shape(B, T_mel // r, T_enc), the
                alignment tensor between the decoder and the encoder, T_mel
                means the length of mel spectrogram, r means the outputs
                per decoder step, T_enc means the encoder time steps.
            done (Variable): Shape(B, 1, 1, T_mel // r), probability that
                the outputs should stop.
            decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r),
                decoder hidden states, where C_dec means the channels of
                decoder states.
        """
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder.decode(
            (keys, values), text_positions, speaker_embed)
        return mel_outputs, alignments, done, decoder_states
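

# A rough shape walk-through of DeepVoiceTTS.forward (values are illustrative
# and assume T_mel % r == 0; they are not taken from any config in this repo):
# with mel_dim=80, r=4, linear_dim=513 and time_upsampling=1,
#   text_sequences: (B, T_enc, 1)   int64
#   mel_inputs:     (B, 80, 1, T_mel)
# produce
#   mel_outputs:    (B, 80, 1, T_mel)    unpacked from (B, 320, 1, T_mel // 4)
#   linear_outputs: (B, 513, 1, T_mel)
#   alignments:     (N, B, T_mel // 4, T_enc)
#   done:           (B, 1, 1, T_mel // 4)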