# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import paddle from paddle import fluid import paddle.fluid.dygraph as dg import numpy as np from . import conv from . import weight_norm def FC(name_scope, in_features, size, num_flatten_dims=1, relu=False, dropout=0.0, epsilon=1e-30, act=None, is_test=False, dtype="float32"): """ A special Linear Layer, when it is used with dropout, the weight is initialized as normal(0, std=np.sqrt((1-dropout) / in_features)) """ # stds if isinstance(in_features, int): in_features = [in_features] stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features] if relu: stds = [std * np.sqrt(2.0) for std in stds] weight_inits = [ fluid.initializer.NormalInitializer(scale=std) for std in stds ] bias_init = fluid.initializer.ConstantInitializer(0.0) # param attrs weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits] bias_attr = fluid.ParamAttr(initializer=bias_init) layer = weight_norm.FC(name_scope, size, num_flatten_dims=num_flatten_dims, param_attr=weight_attrs, bias_attr=bias_attr, act=act, dtype=dtype) return layer def Conv1D(name_scope, in_channels, num_filters, filter_size=3, dilation=1, groups=None, causal=False, std_mul=1.0, dropout=0.0, use_cudnn=True, act=None, dtype="float32"): """ A special Conv1D Layer, when it is used with dropout, the weight is initialized as normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features))) """ # std std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels)) weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std) bias_init = fluid.initializer.ConstantInitializer(0.0) # param attrs weight_attr = fluid.ParamAttr(initializer=weight_init) bias_attr = fluid.ParamAttr(initializer=bias_init) layer = conv.Conv1D( name_scope, in_channels, num_filters, filter_size, dilation, groups=groups, causal=causal, param_attr=weight_attr, bias_attr=bias_attr, use_cudnn=use_cudnn, act=act, dtype=dtype) return layer def Embedding(name_scope, num_embeddings, embed_dim, is_sparse=False, is_distributed=False, padding_idx=None, std=0.01, dtype="float32"): # param attrs weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal( scale=std)) layer = dg.Embedding( name_scope, (num_embeddings, embed_dim), padding_idx=padding_idx, param_attr=weight_attr, dtype=dtype) return layer class Conv1DGLU(dg.Layer): """ A Convolution 1D block with GLU activation. It also applys dropout for the input x. It fuses speaker embeddings through a FC activated by softsign. It has residual connection from the input x, and scale the output by np.sqrt(0.5). """ def __init__(self, name_scope, n_speakers, speaker_dim, in_channels, num_filters, filter_size, dilation, std_mul=4.0, dropout=0.0, causal=False, residual=True, dtype="float32"): super(Conv1DGLU, self).__init__(name_scope, dtype=dtype) # conv spec self.in_channels = in_channels self.n_speakers = n_speakers self.speaker_dim = speaker_dim self.num_filters = num_filters self.filter_size = filter_size self.dilation = dilation self.causal = causal self.residual = residual # weight init and dropout self.std_mul = std_mul self.dropout = dropout if residual: assert ( in_channels == num_filters ), "this block uses residual connection"\ "the input_channes should equals num_filters" self.conv = Conv1D( self.full_name(), in_channels, 2 * num_filters, filter_size, dilation, causal=causal, std_mul=std_mul, dropout=dropout, dtype=dtype) if n_speakers > 1: assert (speaker_dim is not None ), "speaker embed should not be null in multi-speaker case" self.fc = Conv1D( self.full_name(), speaker_dim, num_filters, filter_size=1, dilation=1, causal=False, act="softsign", dtype=dtype) def forward(self, x, speaker_embed_bc1t=None): """ Args: x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU layer, where B means batch_size, C_in means the input channels T means input time steps. speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded speaker embed, where C_sp means speaker embedding size. Note that when using residual connection, the Conv1DGLU does not change the number of channels, so out channels equals input channels. Returns: x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where C_out means the output channels of Conv1DGLU. """ residual = x x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = self.conv(x) content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) if speaker_embed_bc1t is not None: sp = self.fc(speaker_embed_bc1t) content = content + sp # glu x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) if self.residual: x = fluid.layers.scale(x + residual, np.sqrt(0.5)) return x def add_input(self, x, speaker_embed_bc11=None): """ Inputs: x: shape(B, num_filters, 1, time_steps) speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps) Outputs: out: shape(B, num_filters, 1, time_steps), where time_steps = 1 """ residual = x # add step input and produce step output x = fluid.layers.dropout( x, self.dropout, dropout_implementation="upscale_in_train") x = self.conv.add_input(x) content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) if speaker_embed_bc11 is not None: sp = self.fc(speaker_embed_bc11) content = content + sp x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content) if self.residual: x = fluid.layers.scale(x + residual, np.sqrt(0.5)) return x def Conv1DTranspose(name_scope, in_channels, num_filters, filter_size, padding=0, stride=1, dilation=1, groups=None, std_mul=1.0, dropout=0.0, use_cudnn=True, act=None, dtype="float32"): std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)) weight_init = fluid.initializer.NormalInitializer(scale=std) weight_attr = fluid.ParamAttr(initializer=weight_init) bias_init = fluid.initializer.ConstantInitializer(0.0) bias_attr = fluid.ParamAttr(initializer=bias_init) layer = conv.Conv1DTranspose( name_scope, in_channels, num_filters, filter_size, padding=padding, stride=stride, dilation=dilation, groups=groups, param_attr=weight_attr, bias_attr=bias_attr, use_cudnn=use_cudnn, act=act, dtype=dtype) return layer def compute_position_embedding(rad): # rad is a transposed radius, shape(embed_dim, n_vocab) embed_dim, n_vocab = rad.shape even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32")) odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32")) even_rads = fluid.layers.gather(rad, even_dims) odd_rads = fluid.layers.gather(rad, odd_dims) sines = fluid.layers.sin(even_rads) cosines = fluid.layers.cos(odd_rads) temp = fluid.layers.scatter(rad, even_dims, sines) out = fluid.layers.scatter(temp, odd_dims, cosines) out = fluid.layers.transpose(out, perm=[1, 0]) return out def position_encoding_init(n_position, d_pos_vec, position_rate=1.0, sinusoidal=True): """ Init the sinusoid position encoding table """ # keep idx 0 for padding token position encoding zero vector position_enc = np.array([[ position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec) for i in range(d_pos_vec) ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) if sinusoidal: position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 return position_enc class PositionEmbedding(dg.Layer): def __init__(self, name_scope, n_position, d_pos_vec, position_rate=1.0, is_sparse=False, is_distributed=False, param_attr=None, max_norm=None, padding_idx=None, dtype="float32"): super(PositionEmbedding, self).__init__(name_scope, dtype=dtype) self.embed = dg.Embedding( self.full_name(), size=(n_position, d_pos_vec), is_sparse=is_sparse, is_distributed=is_distributed, padding_idx=None, param_attr=param_attr, dtype=dtype) self.set_weight( position_encoding_init( n_position, d_pos_vec, position_rate=position_rate, sinusoidal=False).astype(dtype)) self._is_sparse = is_sparse self._is_distributed = is_distributed self._remote_prefetch = self._is_sparse and (not self._is_distributed) if self._remote_prefetch: assert self._is_sparse is True and self._is_distributed is False self._padding_idx = (-1 if padding_idx is None else padding_idx if padding_idx >= 0 else (n_position + padding_idx)) self._position_rate = position_rate self._max_norm = max_norm self._dtype = dtype def set_weight(self, array): assert self.embed._w.shape == list(array.shape), "shape does not match" self.embed._w._ivar.value().get_tensor().set( array, fluid.framework._current_expected_place()) def forward(self, indices, speaker_position_rate=None): """ Args: indices (Variable): Shape (B, T, 1), dtype: int64, position indices, where B means the batch size, T means the time steps. speaker_position_rate (Variable | float, optional), position rate. It can be a float point number or a Variable with shape (1,), then this speaker_position_rate is used for every example. It can also be a Variable with shape (B, 1), which contains a speaker position rate for each speaker. Returns: out (Variable): Shape(B, C_pos), position embedding, where C_pos means position embedding size. """ rad = fluid.layers.transpose(self.embed._w, perm=[1, 0]) batch_size = indices.shape[0] if speaker_position_rate is None: weight = compute_position_embedding(rad) out = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="lookup_table", inputs={"Ids": indices, "W": weight}, outputs={"Out": out}, attrs={ "is_sparse": self._is_sparse, "is_distributed": self._is_distributed, "remote_prefetch": self._remote_prefetch, "padding_idx": self._padding_idx, # special value for lookup table op }) return out elif (np.isscalar(speaker_position_rate) or isinstance(speaker_position_rate, fluid.framework.Variable) and speaker_position_rate.shape == [1, 1]): # # make a weight # scale the weight (the operand for sin & cos) if np.isscalar(speaker_position_rate): scaled_rad = fluid.layers.scale(rad, speaker_position_rate) else: scaled_rad = fluid.layers.elementwise_mul( rad, speaker_position_rate[0]) weight = compute_position_embedding(scaled_rad) out = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( type="lookup_table", inputs={"Ids": indices, "W": weight}, outputs={"Out": out}, attrs={ "is_sparse": self._is_sparse, "is_distributed": self._is_distributed, "remote_prefetch": self._remote_prefetch, "padding_idx": self._padding_idx, # special value for lookup table op }) return out elif np.prod(speaker_position_rate.shape) > 1: assert speaker_position_rate.shape == [batch_size, 1] outputs = [] for i in range(batch_size): rate = speaker_position_rate[i] # rate has shape [1] scaled_rad = fluid.layers.elementwise_mul(rad, rate) weight = compute_position_embedding(scaled_rad) out = self._helper.create_variable_for_type_inference( self._dtype) sequence = indices[i] self._helper.append_op( type="lookup_table", inputs={"Ids": sequence, "W": weight}, outputs={"Out": out}, attrs={ "is_sparse": self._is_sparse, "is_distributed": self._is_distributed, "remote_prefetch": self._remote_prefetch, "padding_idx": -1, }) outputs.append(out) out = fluid.layers.stack(outputs) return out else: raise Exception("Then you can just use position rate at init") class Conv1D_GU(dg.Layer): def __init__(self, name_scope, conditioner_dim, in_channels, num_filters, filter_size, dilation, causal=False, residual=True, dtype="float32"): super(Conv1D_GU, self).__init__(name_scope, dtype=dtype) self.conditioner_dim = conditioner_dim self.in_channels = in_channels self.num_filters = num_filters self.filter_size = filter_size self.dilation = dilation self.causal = causal self.residual = residual if residual: assert ( in_channels == num_filters ), "this block uses residual connection"\ "the input_channels should equals num_filters" self.conv = Conv1D( self.full_name(), in_channels, 2 * num_filters, filter_size, dilation, causal=causal, dtype=dtype) self.fc = Conv1D( self.full_name(), conditioner_dim, 2 * num_filters, filter_size=1, dilation=1, causal=False, dtype=dtype) def forward(self, x, skip=None, conditioner=None): """ Args: x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU layer, where B means batch_size, C_in means the input channels T means input time steps. skip (Variable): Shape(B, C_in, 1, T), skip connection. conditioner (Variable): Shape(B, C_con, 1, T), expanded mel conditioner, where C_con is conditioner hidden dim which equals the num of mel bands. Note that when using residual connection, the Conv1D_GU does not change the number of channels, so out channels equals input channels. Returns: x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where C_out means the output channels of Conv1D_GU. skip (Variable): Shape(B, C_out, 1, T), skip connection. """ residual = x x = self.conv(x) if conditioner is not None: cond_bias = self.fc(conditioner) x += cond_bias content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) # Gated Unit. x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) if skip is None: skip = x else: skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) if self.residual: x = fluid.layers.scale(residual + x, np.sqrt(0.5)) return x, skip def add_input(self, x, skip=None, conditioner=None): """ Inputs: x: shape(B, num_filters, 1, time_steps) skip: shape(B, num_filters, 1, time_steps), skip connection conditioner: shape(B, conditioner_dim, 1, time_steps) Outputs: x: shape(B, num_filters, 1, time_steps), where time_steps = 1 skip: skip connection, same shape as x """ residual = x # add step input and produce step output x = self.conv.add_input(x) if conditioner is not None: cond_bias = self.fc(conditioner) x += cond_bias content, gate = fluid.layers.split(x, num_or_sections=2, dim=1) # Gated Unit. x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), fluid.layers.tanh(content)) if skip is None: skip = x else: skip = fluid.layers.scale(skip + x, np.sqrt(0.5)) if self.residual: x = fluid.layers.scale(residual + x, np.sqrt(0.5)) return x, skip def Conv2DTranspose(name_scope, num_filters, filter_size, padding=0, stride=1, dilation=1, use_cudnn=True, act=None, dtype="float32"): val = 1.0 / (filter_size[0] * filter_size[1]) weight_init = fluid.initializer.ConstantInitializer(val) weight_attr = fluid.ParamAttr(initializer=weight_init) layer = weight_norm.Conv2DTranspose( name_scope, num_filters, filter_size=filter_size, padding=padding, stride=stride, dilation=dilation, param_attr=weight_attr, use_cudnn=use_cudnn, act=act, dtype=dtype) return layer