# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from itertools import chain from collections import namedtuple from paddle import fluid import paddle.fluid.dygraph as dg import numpy as np from deepvoice3_paddle import conv from deepvoice3_paddle.modules import Embedding, PositionEmbedding from deepvoice3_paddle.modules import FC, Conv1D, Conv1DGLU, Conv1DTranspose ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"]) WindowRange = namedtuple("WindowRange", ["backward", "ahead"]) def expand_speaker_embed(x, speaker_embed, tdim=-1): """ Expand speaker embeddings for multiple timesteps. Args: x (Variable): A reference Variable used to determine number of timesteps. speaker_embed (Variable): Shape(B, C), embeddings of speakers, where B means batch_size, C means speaker embedding size. tdim (int, optional): The idex of time dimension in x. Defaults to -1, which means the last dimension is time dimension. Returns: Variable: Shape(B, C, 1, T), the expanded speaker embeddings, where T = x.shape[tdim]. T means number of timesteps. """ speaker_embed = fluid.layers.reshape( speaker_embed, shape=speaker_embed.shape + [1, 1]) time_steps = x.shape[tdim] speaker_embed_bc1t = fluid.layers.expand( speaker_embed, expand_times=[1, 1, 1, time_steps]) return speaker_embed_bc1t def gen_mask2(valid_lengths, max_len, dtype="float32"): """ Generate a mask tensor from valid lengths. note that it return a *reverse* mask. Indices within valid lengths correspond to 0, and those within padding area correspond to 1. Assume that valid_lengths = [2,5,7], and max_len = 7, the generated mask is [[0, 0, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 0]]. Args: valid_lengths (Variable): Shape(B), dtype: int64. A 1D-Tensor containing the valid lengths (timesteps) of each example, where B means beatch_size. max_len (int): The length (number of timesteps) of the mask. dtype (str, optional): A string that specifies the data type of the returned mask. Returns: mask (Variable): A mask computed from valid lengths. """ batch_size = valid_lengths.shape[0] mask = fluid.layers.sequence_mask( valid_lengths, maxlen=max_len, dtype=dtype) mask = 1 - mask return mask def expand_mask(mask, attn): """ Expand a mask for multiple time steps. This function is used by the AttentionLayer in the Decoder to expand a mask for every timestep in the decoder. Args: mask (Variable): Shape(B, T_enc), a mask generated with valid text lengths, where T_enc means encoder length(time steps). attn (Variable): Shape(B, T_dec, T_enc), a Variable stands for the alignment tensor between encoder and decoder, where T_dec means the decoder length(time_steps). Returns: mask_btc (Variable): shape(B, T_dec, T_enc), the expanded mask. """ decoder_length = attn.shape[1] mask = fluid.layers.reshape(mask, [mask.shape[0], 1, mask.shape[1]]) mask_btc = fluid.layers.expand(mask, expand_times=[1, decoder_length, 1]) return mask_btc class Encoder(dg.Layer): def __init__(self, name_scope, n_vocab, embed_dim, n_speakers, speaker_dim, padding_idx=None, embedding_weight_std=0.1, convolutions=(ConvSpec(64, 5, 1)) * 7, max_positions=512, dropout=0.1, dtype="float32"): super(Encoder, self).__init__(name_scope, dtype=dtype) self.dropout = dropout self.embedding_weight_std = embedding_weight_std self.embed = Embedding( self.full_name(), n_vocab, embed_dim, padding_idx=padding_idx, std=embedding_weight_std, dtype=dtype) if n_speakers > 1: self.sp_proj1 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.sp_proj2 = Conv1D( self.full_name(), speaker_dim, embed_dim, filter_size=1, std_mul=1.0, dropout=dropout, act="softsign", dtype=dtype) self.n_speakers = n_speakers self.convolutions = [] in_channels = embed_dim std_mul = 1.0 for (out_channels, filter_size, dilation) in convolutions: # 1 * 1 convolution & relu if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size, dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 self.convolutions.append( Conv1D( self.full_name(), in_channels, embed_dim, filter_size=1, std_mul=std_mul, dropout=dropout, dtype=dtype)) for i, layer in enumerate(self.convolutions): self.add_sublayer("convolution_{}".format(i), layer) def forward(self, x, speaker_embed=None): """ Encode text sequence. Args: x (Variable): Shape(B, T_enc), dtype: int64. Ihe input text indices. T_enc means the timesteps of decoder input x. speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim), dtype: float32. Speaker embeddings. This arg is not None only when the model is a multispeaker model. Returns: keys (Variable), Shape(B, C_emb, 1, T_enc), the encoded representation for keys, where C_emb menas the text embedding size. values (Variable), Shape(B, C_embed, 1, T_enc), the encoded representation for values. """ x = self.embed(x) x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = fluid.layers.transpose( fluid.layers.reshape( x, shape=x.shape + [1]), perm=[0, 2, 3, 1]) speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = x + self.sp_proj1(speaker_embed_bc1t) input_embed = x for layer in self.convolutions: if isinstance(layer, Conv1DGLU): x = layer(x, speaker_embed_bc1t) else: x = layer(x) if speaker_embed is not None: x = x + self.sp_proj2(speaker_embed_bc1t) keys = x values = fluid.layers.scale(input_embed + x, scale=np.sqrt(0.5)) return keys, values def freeze_embedding(self): """Fix text embedding while training.""" for param in self.embed.parameters(): param.trainable = False class AttentionLayer(dg.Layer): def __init__(self, name_scope, conv_channels, embed_dim, dropout=0.0, window_range=WindowRange(-1, 3), key_projection=True, value_projection=True, dtype="float32"): super(AttentionLayer, self).__init__(name_scope, dtype=dtype) self.query_proj = Conv1D( self.full_name(), conv_channels, embed_dim, filter_size=1, dtype=dtype) if key_projection: self.key_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) if value_projection: self.value_proj = Conv1D( self.full_name(), embed_dim, embed_dim, filter_size=1, dtype=dtype) self.out_proj = Conv1D( self.full_name(), embed_dim, conv_channels, filter_size=1, dtype=dtype) self.key_projection = key_projection self.value_projection = value_projection self.dropout = dropout self.window_range = window_range def forward(self, query, encoder_out, mask=None, last_attended=None): """ Compute pooled context representation and alignment scores. Args: query (Variable): shape(B, C_q, 1, T_dec), the query tensor, where C_q means the channel of query. encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. mask (Variable, optional): Shape(B, T_enc), mask generated with valid text lengths. last_attended (int, optional): The position that received most attention at last timestep. This is only used at decoding. Outpus: x (Variable): Shape(B, C_q, 1, T_dec), the context representation pooled from attention mechanism. attn_scores (Variable): shape(B, T_dec, T_enc), the alignment tensor, where T_dec means the number of decoder time steps and T_enc means number the number of decoder time steps. """ keys, values = encoder_out residual = query if self.value_projection: values = self.value_proj(values) if self.key_projection: keys = self.key_proj(keys) x = self.query_proj(query) batch_size, conv_channels, _, decoder_length = query.shape encoder_length = keys.shape[-1] embed_dim = keys.shape[1] x = fluid.layers.matmul( fluid.layers.reshape( x, shape=[batch_size, embed_dim, decoder_length]), fluid.layers.reshape( keys, shape=[batch_size, embed_dim, encoder_length]), transpose_x=True) mask_value = -1.0e30 if mask is not None: mask = expand_mask(mask, x) neg_inf_mask = fluid.layers.scale(mask, mask_value) x = x + neg_inf_mask # if last_attended is provided, focus only on a window range around it # to enforce monotonic attention. if last_attended is not None: locality_mask = np.ones(shape=x.shape, dtype=np.float32) backward, ahead = self.window_range backward = last_attended + backward ahead = last_attended + ahead if backward < 0: backward = 0 if ahead > x.shape[-1]: ahead = x.shape[-1] locality_mask[:, :, backward:ahead] = 0. locality_mask = dg.to_variable(locality_mask) neg_inf_mask = fluid.layers.scale(locality_mask, mask_value) x = x + neg_inf_mask x = fluid.layers.softmax(x) attn_scores = x x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = fluid.layers.matmul( fluid.layers.reshape( values, shape=[batch_size, embed_dim, encoder_length]), x, transpose_y=True) x = fluid.layers.reshape(x, [batch_size, embed_dim, 1, decoder_length]) x = fluid.layers.scale(x, encoder_length * np.sqrt(1.0 / encoder_length)) x = self.out_proj(x) x = fluid.layers.scale((x + residual), np.sqrt(0.5)) return x, attn_scores class Decoder(dg.Layer): def __init__(self, name_scope, n_speakers, speaker_dim, embed_dim, mel_dim=80, r=5, max_positions=512, padding_idx=None, preattention=(ConvSpec(128, 5, 1)) * 4, convolutions=(ConvSpec(128, 5, 1)) * 4, attention=True, dropout=0.1, use_memory_mask=False, force_monotonic_attention=False, query_position_rate=1.0, key_position_rate=1.29, window_range=WindowRange(-1, 3), key_projection=True, value_projection=True, dtype="float32"): super(Decoder, self).__init__(name_scope, dtype=dtype) self.dropout = dropout self.mel_dim = mel_dim self.r = r self.query_position_rate = query_position_rate self.key_position_rate = key_position_rate self.window_range = window_range self.n_speakers = n_speakers conv_channels = convolutions[0].out_channels self.embed_query_positions = PositionEmbedding( self.full_name(), max_positions, conv_channels, padding_idx=padding_idx, dtype=dtype) self.embed_keys_positions = PositionEmbedding( self.full_name(), max_positions, embed_dim, padding_idx=padding_idx, dtype=dtype) # Used to compute multiplier for position rate if n_speakers > 1: self.speaker_proj1 = FC(self.full_name(), speaker_dim, 1, act="sigmoid", dropout=dropout, dtype=dtype) self.speaker_proj2 = FC(self.full_name(), speaker_dim, 1, act="sigmoid", dropout=dropout, dtype=dtype) # prenet self.prenet = [] in_channels = mel_dim * r std_mul = 1.0 for (out_channels, filter_size, dilation) in preattention: if in_channels != out_channels: # conv1d & relu self.prenet.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu")) in_channels = out_channels std_mul = 2.0 self.prenet.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size, dilation, std_mul=std_mul, dropout=dropout, causal=True, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 for i, layer in enumerate(self.prenet): self.add_sublayer("prenet_{}".format(i), layer) self.use_memory_mask = use_memory_mask if isinstance(attention, bool): self.attention = [attention] * len(convolutions) else: self.attention = attention if isinstance(force_monotonic_attention, bool): self.force_monotonic_attention = [force_monotonic_attention ] * len(convolutions) else: self.force_monotonic_attention = force_monotonic_attention # causual convolution & attention self.conv_attn = [] for use_attention, (out_channels, filter_size, dilation) in zip(self.attention, convolutions): assert ( in_channels == out_channels ), "the stack of convolution & attention does not change channels" conv_layer = Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size, dilation, std_mul=std_mul, dropout=dropout, causal=True, residual=False, dtype=dtype) attn_layer = (AttentionLayer( self.full_name(), out_channels, embed_dim, dropout=dropout, window_range=window_range, key_projection=key_projection, value_projection=value_projection, dtype=dtype) if use_attention else None) in_channels = out_channels std_mul = 4.0 self.conv_attn.append((conv_layer, attn_layer)) for i, (conv_layer, attn_layer) in enumerate(self.conv_attn): self.add_sublayer("conv_{}".format(i), conv_layer) if attn_layer is not None: self.add_sublayer("attn_{}".format(i), attn_layer) # 1 * 1 conv to transform channels self.last_conv = Conv1D( self.full_name(), in_channels, mel_dim * r, filter_size=1, std_mul=std_mul, dropout=dropout, dtype=dtype) # mel (before sigmoid) to done hat self.fc = Conv1D( self.full_name(), mel_dim * r, 1, filter_size=1, dtype=dtype) # decoding configs self.max_decoder_steps = 200 self.min_decoder_steps = 10 def freeze_positional_encoding(self): for param in self.embed_query_positions.parameters(): param.trainable = False for param in self.embed_keys_positions.parameters(): param.trainable = False def forward(self, encoder_out, lengths, inputs, text_positions, frame_positions, speaker_embed=None): """ Compute decoder outputs with ground truth mel spectrogram. Args: encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. lengths (Variable): Shape(batch_size,), dtype: int64, valid lengths of text inputs for each example. inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. frame_positions (Variable): Shape(B, T_dec // r, 1), dtype: int64. Positions indices for each decoder time steps. speaker_embed: shape(batch_size, speaker_dim), speaker embedding, only used for multispeaker model. Returns: outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, where N means number of Attention Layers, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs should stop. decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder hidden states, where C_dec means the channels of decoder states. """ # pack multiple frames if necessary B, _, _, T = inputs.shape if self.r > 1 and inputs.shape[1] == self.mel_dim: if T % self.r != 0: inputs = fluid.layers.slice( inputs, axes=[3], starts=[0], ends=[T - T % self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) inputs = fluid.layers.reshape( inputs, shape=[B, -1, 1, self.mel_dim * self.r]) inputs = fluid.layers.transpose(inputs, [0, 3, 2, 1]) assert inputs.shape[3] == T // self.r if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(inputs, speaker_embed) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") else: speaker_embed_bc1t = None keys, values = encoder_out if self.use_memory_mask and lengths is not None: mask = gen_mask2(lengths, keys.shape[-1]) else: mask = None if text_positions is not None: w = self.key_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj1(speaker_embed), [B, -1]) text_pos_embed = self.embed_keys_positions(text_positions, w) text_pos_embed = fluid.layers.transpose( fluid.layers.reshape( text_pos_embed, shape=text_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) keys = keys + text_pos_embed if frame_positions is not None: w = self.query_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj2(speaker_embed), [B, -1]) frame_pos_embed = self.embed_query_positions(frame_positions, w) frame_pos_embed = fluid.layers.transpose( fluid.layers.reshape( frame_pos_embed, shape=frame_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) else: frame_pos_embed = None x = inputs x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) # Convolution & Multi-hop Attention alignments = [] for conv, attn in self.conv_attn: residual = x x = conv(x, speaker_embed_bc1t) if attn is not None: if frame_pos_embed is not None: x = x + frame_pos_embed x, attn_scores = attn(x, (keys, values), mask) alignments.append(attn_scores) x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5)) alignments = fluid.layers.stack(alignments) decoder_states = x x = self.last_conv(x) outputs = fluid.layers.sigmoid(x) done = fluid.layers.sigmoid(self.fc(x)) return outputs, alignments, done, decoder_states def decode(self, encoder_out, text_positions, speaker_embed=None, initial_input=None, test_inputs=None): """ Decode without ground truth mel spectrogram. Args: encoder_out (Tuple(Variable, Variable)): keys (Variable): shape(B, C_emb, 1, T_enc), the key representation from an encoder, where C_emb means text embedding size. values (Variable): shape(B, C_emb, 1, T_enc), the value representation from an encoder, where C_emb means text embedding size. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. speaker_embed (Variable): Shape(B, C_sp), where C_sp means speaker embedding size. It is only used for multispeaker model. initial_input (Variable, optional): Shape(B, C_mel * r, 1, 1). The input for the first time step of the decoder. If r > 0, it is a packed r frames of mel spectrograms. test_inputs (Variable, optional): Shape(B, C_mel, 1, T_test), where T_test means the time steps of test inputs. This is only used for testing this method, the user should just leave it None. Returns: outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of output mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, T_mel means the length of output mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs stops. decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder hidden states, where C_dec means the channels of decoder states. """ self.start_new_sequence() keys, values = encoder_out B = keys.shape[0] assert B == 1, "now only supports single instance inference" mask = None # no mask because we use single instance decoding w = self.key_position_rate if speaker_embed is not None: if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj1(speaker_embed), shape=[B, -1]) speaker_embed_bc11 = fluid.layers.reshape( speaker_embed, shape=[B, speaker_embed.shape[1], 1, 1]) else: speaker_embed_bc11 = None if text_positions is not None: text_pos_embed = self.embed_keys_positions(text_positions, w) text_pos_embed = fluid.layers.transpose( fluid.layers.reshape( text_pos_embed, shape=text_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) keys = keys + text_pos_embed # start decoding, init accumulators decoder_states = [] outputs = [] alignments = [] dones = [] last_attended = [None] * len(self.conv_attn) for idx, monotonic_attn in enumerate(self.force_monotonic_attention): if monotonic_attn: last_attended[idx] = 0 t = 0 # decoder time step if initial_input is None: initial_input = fluid.layers.zeros( shape=[B, self.mel_dim * self.r, 1, 1], dtype=keys.dtype) current_input = initial_input while True: frame_pos = fluid.layers.fill_constant( shape=[B, 1, 1], value=t + 1, dtype="int64") w = self.query_position_rate if self.n_speakers > 1: w = w * fluid.layers.reshape( self.speaker_proj2(speaker_embed), shape=[B, -1]) frame_pos_embed = self.embed_query_positions(frame_pos, w) frame_pos_embed = fluid.layers.transpose( fluid.layers.reshape( frame_pos_embed, shape=frame_pos_embed.shape + [1]), perm=[0, 2, 3, 1]) if test_inputs is not None: if t >= test_inputs.shape[3]: break current_input = fluid.layers.reshape( test_inputs[:, :, :, t], shape=[B, test_inputs.shape[1], 1, 1]) else: if t > 0: current_input = outputs[-1] x = current_input x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") # Prenet for layer in self.prenet: x = (layer.add_input(x, speaker_embed_bc11) if isinstance(layer, Conv1DGLU) else layer.add_input(x)) step_attn_scores = [] # Casual convolutions + Multi-hop attentions for i, (conv, attn) in enumerate(self.conv_attn): residual = x x = conv.add_input(x, speaker_embed_bc11) if attn is not None: if frame_pos_embed is not None: x = x + frame_pos_embed x, attn_scores = attn(x, (keys, values), mask, last_attended[i]) step_attn_scores.append(attn_scores) # update last attended when necessary if self.force_monotonic_attention[i]: last_attended[i] = np.argmax( attn_scores.numpy(), axis=-1)[0][0] x = fluid.layers.scale(residual + x, scale=np.sqrt(0.5)) if len(step_attn_scores): average_attn_scores = fluid.layers.reduce_mean( fluid.layers.stack(step_attn_scores), dim=0) else: average_attn_scores = None decoder_state = x x = self.last_conv.add_input(x) output = fluid.layers.sigmoid(x) # (B, r * C_mel, 1, 1) done = fluid.layers.sigmoid(self.fc(x)) # (B, 1, 1, 1) decoder_states.append(decoder_state) outputs.append(output) if average_attn_scores is not None: alignments.append(average_attn_scores) dones.append(done) t += 1 if test_inputs is None: if (fluid.layers.reduce_min(done).numpy()[0] > 0.5 and t > self.min_decoder_steps): break elif t > self.max_decoder_steps: break outputs = fluid.layers.concat(outputs, axis=3) if len(alignments): alignments = fluid.layers.concat(alignments, axis=1) else: alignments = None dones = fluid.layers.concat(dones, axis=3) decoder_states = fluid.layers.concat(decoder_states, axis=3) return outputs, alignments, dones, decoder_states def start_new_sequence(self): for layer in self.sublayers(): if isinstance(layer, conv.Conv1D): layer.start_new_sequence() class Converter(dg.Layer): """ Vocoder that transforms mel spectrogram (or ecoder hidden states) to waveform. """ def __init__(self, name_scope, n_speakers, speaker_dim, in_channels, linear_dim, convolutions=(ConvSpec(256, 5, 1)) * 4, time_upsampling=1, dropout=0.1, dtype="float32"): super(Converter, self).__init__(name_scope, dtype=dtype) self.n_speakers = n_speakers self.speaker_dim = speaker_dim self.in_channels = in_channels self.linear_dim = linear_dim self.time_upsampling = time_upsampling self.dropout = dropout target_channels = convolutions[0][0] # conv proj to target channels self.first_conv_proj = Conv1D( self.full_name(), in_channels, target_channels, filter_size=1, std_mul=1.0, dtype=dtype) # Idea from nyanko # upsampling convolitions if time_upsampling == 4: self.upsampling_convolutions = [ Conv1DTranspose( self.full_name(), target_channels, target_channels, filter_size=2, padding=0, stride=2, std_mul=1.0, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=1, std_mul=1.0, dropout=dropout, causal=False, residual=True, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=3, std_mul=4.0, dropout=dropout, causal=False, residual=True, dtype=dtype), Conv1DTranspose( self.full_name(), target_channels, target_channels, filter_size=2, padding=0, stride=2, std_mul=4.0, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=1, std_mul=1.0, dropout=dropout, causal=False, residual=True, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=3, std_mul=4.0, dropout=dropout, causal=False, residual=True, dtype=dtype), ] elif time_upsampling == 2: self.upsampling_convolutions = [ Conv1DTranspose( self.full_name(), target_channels, target_channels, filter_size=2, padding=0, stride=2, std_mul=1.0, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=1, std_mul=1.0, dropout=dropout, causal=False, residual=True, dtype=dtype), Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=3, std_mul=4.0, dropout=dropout, causal=False, residual=True, dtype=dtype), ] elif time_upsampling == 1: self.upsampling_convolutions = [ Conv1DGLU( self.full_name(), n_speakers, speaker_dim, target_channels, target_channels, filter_size=3, dilation=3, std_mul=4.0, dropout=dropout, causal=False, residual=True, dtype=dtype) ] else: raise ValueError("Not supported.") for i, layer in enumerate(self.upsampling_convolutions): self.add_sublayer("upsampling_convolutions_{}".format(i), layer) # post conv layers std_mul = 4.0 in_channels = target_channels self.convolutions = [] for (out_channels, filter_size, dilation) in convolutions: if in_channels != out_channels: self.convolutions.append( Conv1D( self.full_name(), in_channels, out_channels, filter_size=1, std_mul=std_mul, act="relu", dtype=dtype)) in_channels = out_channels std_mul = 2.0 self.convolutions.append( Conv1DGLU( self.full_name(), n_speakers, speaker_dim, in_channels, out_channels, filter_size=filter_size, dilation=dilation, std_mul=std_mul, dropout=dropout, causal=False, residual=True, dtype=dtype)) in_channels = out_channels std_mul = 4.0 for i, layer in enumerate(self.convolutions): self.add_sublayer("convolutions_{}".format(i), layer) # final conv proj, channel transformed to linear dim self.last_conv_proj = Conv1D( self.full_name(), in_channels, linear_dim, filter_size=1, std_mul=std_mul, dropout=dropout, act="sigmoid", dtype=dtype) def forward(self, x, speaker_embed=None): """ Convert mel spectrogram or decoder hidden states to linear spectrogram. Args: x (Variable): Shape(B, C_in, 1, T_mel), converter inputs, where C_in means the input channel for the converter. Note that it can be either C_mel (channel of mel spectrogram) or C_dec // r. When use mel_spectrogram as the input of converter, C_in = C_mel; and when use decoder states as the input of converter, C_in = C_dec // r. In this scenario, decoder hidden states are treated as if they were r outputs per decoder step and are unpacked before passing to the converter. speaker_embed (Variable, optional): shape(B, C_sp), speaker embedding, where C_sp means the speaker embedding size. Returns: out (Variable): Shape(B, C_lin, 1, T_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling converter. """ speaker_embed_bc1t = None if speaker_embed is not None: speaker_embed_bc1t = expand_speaker_embed(x, speaker_embed, tdim=-1) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = self.first_conv_proj(x) for layer in chain(self.upsampling_convolutions, self.convolutions): # time_steps may change when timt_upsampling > 1 if (speaker_embed_bc1t is not None and speaker_embed_bc1t.shape[3] != x.shape[3]): speaker_embed_bc1t = expand_speaker_embed( x, speaker_embed, tdim=3) speaker_embed_bc1t = fluid.layers.dropout( speaker_embed_bc1t, self.dropout, dropout_implementation="upscale_in_train") x = (layer(x, speaker_embed_bc1t) if isinstance(layer, Conv1DGLU) else layer(x)) out = self.last_conv_proj(x) return out class DeepVoiceTTS(dg.Layer): def __init__(self, name_scope, n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, linear_dim, postnet_convolutions, time_upsampling, dropout, use_decoder_state_for_postnet_input, dtype): super(DeepVoiceTTS, self).__init__(name_scope, dtype) self.n_speakers = n_speakers self.speaker_dim = speaker_dim if n_speakers > 1: self.speaker_embedding = Embedding( self.full_name(), n_speakers, speaker_dim, padding_idx=None, std=speaker_embedding_weight_std, dtype=dtype) self.embed_dim = embed_dim self.mel_dim = mel_dim self.r = r self.seq2seq = ConvS2S( self.full_name(), n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, dropout, dtype) self.use_decoder_state_for_postnet_input = use_decoder_state_for_postnet_input if use_decoder_state_for_postnet_input: assert ( attentive_convolutions[-1].out_channels % self.r == 0 ), "when using decoder states as converter input, you must assure the decoder state channels can be divided by r" converter_input_channels = attentive_convolutions[ -1].out_channels // r else: converter_input_channels = mel_dim self.converter_input_channels = converter_input_channels self.linear_dim = linear_dim self.converter = Converter( self.full_name(), n_speakers, speaker_dim, converter_input_channels, linear_dim, convolutions=postnet_convolutions, time_upsampling=time_upsampling, dropout=dropout, dtype=dtype) def forward(self, text_sequences, valid_lengths, mel_inputs, speaker_indices=None, text_positions=None, frame_positions=None): """ Encode text sequence and decode with ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. valid_lengths (Variable): shape(batch_size,), dtype: int64, valid lengths for each example in text_sequences. mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. speaker_indices (Variable, optional): Shape(Batch_size), dtype: int64. Speaker index for each example. This arg is not None only when the model is a multispeaker model. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. frame_positions (Variable): Shape(B, T_dec // r, 1), dtype: int64. Positions indices for each decoder time steps. Returns: mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling converter. alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, where N means number of Attention Layers, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs should stop. """ batch_size = text_sequences.shape[0] if self.n_speakers == 1: assert speaker_indices is None, "this model does not support multi-speaker" if speaker_indices is not None: speaker_embed = self.speaker_embedding(speaker_indices) else: speaker_embed = None mel_outputs, alignments, done, decoder_states = self.seq2seq( text_sequences, valid_lengths, mel_inputs, speaker_embed, text_positions, frame_positions) # unpack multi frames if self.r > 1: mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1]) mel_outputs = fluid.layers.reshape( mel_outputs, [batch_size, -1, 1, self.mel_dim]) mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1]) if self.use_decoder_state_for_postnet_input: postnet_input = fluid.layers.transpose(decoder_states, [0, 3, 2, 1]) postnet_input = fluid.layers.reshape( postnet_input, [batch_size, -1, 1, self.converter_input_channels]) postnet_input = fluid.layers.transpose(postnet_input, [0, 3, 2, 1]) else: postnet_input = mel_outputs linear_outputs = self.converter(postnet_input, speaker_embed) return mel_outputs, linear_outputs, alignments, done def transduce(self, text_sequences, text_positions, speaker_indices=None): """ Encode text sequence and decode without ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. speaker_indices (Variable, optional): Shape(Batch_size, 1), dtype: int64. Speaker index for each example. This arg is not None only when the model is a multispeaker model. Returns: mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. linear_outputs (Variable): Shape(B, C_lin, 1, T_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_linear means the length(time steps) of linear spectrogram. T_line = time_upsampling * T_mel, which depends on the time_upsampling converter. alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs should stop. """ batch_size = text_sequences.shape[0] if speaker_indices is not None: speaker_embed = self.speaker_embedding(speaker_indices) else: speaker_embed = None mel_outputs, alignments, done, decoder_states = self.seq2seq.transduce( text_sequences, text_positions, speaker_embed) if self.r > 1: mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1]) mel_outputs = fluid.layers.reshape( mel_outputs, [batch_size, -1, 1, self.mel_dim]) mel_outputs = fluid.layers.transpose(mel_outputs, [0, 3, 2, 1]) if self.use_decoder_state_for_postnet_input: postnet_input = fluid.layers.transpose(decoder_states, [0, 3, 2, 1]) postnet_input = fluid.layers.reshape( postnet_input, [batch_size, -1, 1, self.converter_input_channels]) postnet_input = fluid.layers.transpose(postnet_input, [0, 3, 2, 1]) else: postnet_input = mel_outputs linear_outputs = self.converter(postnet_input, speaker_embed) return mel_outputs, linear_outputs, alignments, done class ConvS2S(dg.Layer): def __init__(self, name_scope, n_speakers, speaker_dim, speaker_embedding_weight_std, n_vocab, embed_dim, text_padding_idx, text_embedding_weight_std, freeze_text_embedding, encoder_convolutions, max_positions, position_padding_idx, trainable_positional_encodings, mel_dim, r, prenet_convolutions, attentive_convolutions, attention, use_memory_mask, force_monotonic_attention, query_position_rate, key_position_rate, window_range, key_projection, value_projection, dropout, dtype): super(ConvS2S, self).__init__(name_scope, dtype) self.freeze_text_embedding = freeze_text_embedding self.trainable_positional_encodings = trainable_positional_encodings self.n_speakers = n_speakers self.speaker_dim = speaker_dim self.embed_dim = embed_dim self.encoder = Encoder( self.full_name(), n_vocab, embed_dim, n_speakers, speaker_dim, padding_idx=None, embedding_weight_std=text_embedding_weight_std, convolutions=encoder_convolutions, max_positions=max_positions, dropout=dropout, dtype=dtype) if freeze_text_embedding: self.encoder.freeze_embedding() self.mel_dim = mel_dim self.r = r self.decoder = Decoder( self.full_name(), n_speakers, speaker_dim, embed_dim, mel_dim, r, max_positions, position_padding_idx, preattention=prenet_convolutions, convolutions=attentive_convolutions, attention=attention, dropout=dropout, use_memory_mask=use_memory_mask, force_monotonic_attention=force_monotonic_attention, query_position_rate=query_position_rate, key_position_rate=key_position_rate, window_range=window_range, key_projection=key_projection, value_projection=key_projection, dtype=dtype) if not trainable_positional_encodings: self.decoder.freeze_positional_encoding() def forward(self, text_sequences, valid_lengths, mel_inputs, speaker_embed=None, text_positions=None, frame_positions=None): """ Encode text sequence and decode with ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. valid_lengths (Variable): shape(batch_size,), dtype: int64, valid lengths for each example in text_sequences. mel_inputs (Variable): Shape(B, C_mel, 1, T_mel), ground truth mel-spectrogram, which is used as decoder inputs when training. speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim), dtype: float32. Speaker embeddings. This arg is not None only when the model is a multispeaker model. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. frame_positions (Variable): Shape(B, T_dec // r, 1), dtype: int64. Positions indices for each decoder time steps. Returns: mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. alignments (Variable): Shape(N, B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, where N means number of Attention Layers, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs should stop. decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder hidden states, where C_dec means the channels of decoder states. """ keys, values = self.encoder(text_sequences, speaker_embed) mel_outputs, alignments, done, decoder_states = self.decoder( (keys, values), valid_lengths, mel_inputs, text_positions, frame_positions, speaker_embed) return mel_outputs, alignments, done, decoder_states def transduce(self, text_sequences, text_positions, speaker_embed=None): """ Encode text sequence and decode without ground truth mel spectrogram. Args: text_sequences (Variable): Shape(B, T_enc, 1), dtype: int64. Ihe input text indices. T_enc means the timesteps of text_sequences. text_positions (Variable): Shape(B, T_enc, 1), dtype: int64. Positions indices for text inputs for the encoder, where T_enc means the encoder timesteps. speaker_embed (Variable, optional): Shape(Batch_size, speaker_dim), dtype: float32. Speaker embeddings. This arg is not None only when the model is a multispeaker model. Returns: mel_outputs (Variable): Shape(B, C_mel * r, 1, T_mel // r). Decoder outputs, where C_mel means the channels of mel-spectrogram, r means the outputs per decoder step, T_mel means the length(time steps) of mel spectrogram. Note that, when r > 1, the decoder outputs r frames of mel spectrogram per step. alignments (Variable): Shape(B, T_mel // r, T_enc), the alignment tensor between the decoder and the encoder, T_mel means the length of mel spectrogram, r means the outputs per decoder step, T_enc means the encoder time steps. done (Variable): Shape(B, 1, 1, T_mel // r), probability that the outputs should stop. decoder_states (Variable): Shape(B, C_dec, 1, T_mel // r), decoder hidden states, where C_dec means the channels of decoder states. """ keys, values = self.encoder(text_sequences, speaker_embed) mel_outputs, alignments, done, decoder_states = self.decoder.decode( (keys, values), text_positions, speaker_embed) return mel_outputs, alignments, done, decoder_states