import numpy as np

from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F
import paddle.fluid.initializer as I

from parakeet.modules.weight_norm import Conv1D, Conv1DCell, Conv2D, Linear


class Conv1DGLU(dg.Layer):
    """
    A Convolution 1D block with GLU activation. It also applies dropout
    to the input x. It fuses speaker embeddings through an FC layer
    activated by softsign. It has a residual connection from the input
    x, and scales the output by np.sqrt(0.5).
    """

    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size=1,
                 dilation=1,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True):
        super(Conv1DGLU, self).__init__()

        # conv spec
        self.in_channels = in_channels
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation

        # padding
        self.causal = causal

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        self.residual = residual
        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses a residual connection, " \
               "so in_channels should equal num_filters"

        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
        self.conv = Conv1DCell(in_channels,
                               2 * num_filters,
                               filter_size,
                               dilation,
                               causal,
                               param_attr=I.Normal(scale=std))

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
            self.fc = Linear(speaker_dim,
                             num_filters,
                             param_attr=I.Normal(scale=std))

    def forward(self, x, speaker_embed=None):
        """
        Args:
            x (Variable): Shape(B, C_in, T), the input of the Conv1DGLU
                layer, where B means batch size, C_in means the input
                channels, and T means input time steps.
            speaker_embed (Variable): Shape(B, C_sp), the speaker
                embedding, where C_sp means the speaker embedding size.

        Note that when using the residual connection, the Conv1DGLU does
        not change the number of channels, so the output channels equal
        the input channels.

        Returns:
            x (Variable): Shape(B, C_out, T), the output of Conv1DGLU,
                where C_out means the output channels of Conv1DGLU.
        """
        residual = x
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = self.conv(x)
        # split along the channel axis into content and gate halves
        content, gate = F.split(x, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            # fuse speaker embedding, broadcasting over time steps
            sp = F.softsign(self.fc(speaker_embed))
            content = F.elementwise_add(content, sp, axis=0)

        # glu
        x = F.sigmoid(gate) * content

        if self.residual:
            # scale by sqrt(0.5) to keep the output variance roughly
            # constant when summing the branch with the residual
            x = F.scale(x + residual, np.sqrt(0.5))
        return x

    def start_sequence(self):
        self.conv.start_sequence()

    def add_input(self, x_t, speaker_embed=None):
        """
        Args:
            x_t (Variable): Shape(B, C_in), the input of the Conv1DGLU
                layer at one time step, where B means batch size and
                C_in means the input channels.
            speaker_embed (Variable): Shape(B, C_sp), the speaker
                embedding, where C_sp means the speaker embedding size.

        Note that when using the residual connection, the Conv1DGLU does
        not change the number of channels, so the output channels equal
        the input channels.

        Returns:
            x_t (Variable): Shape(B, C_out), the output of Conv1DGLU at
                one time step, where C_out means the output channels of
                Conv1DGLU.
        """
        residual = x_t
        x_t = F.dropout(
            x_t, self.dropout, dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        # split along the channel axis into content and gate halves
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            sp = F.softsign(self.fc(speaker_embed))
            content_t = F.elementwise_add(content_t, sp, axis=0)

        # glu
        x_t = F.sigmoid(gate_t) * content_t

        if self.residual:
            x_t = F.scale(x_t + residual, np.sqrt(0.5))
        return x_t
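

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): it shows how a
# Conv1DGLU might be driven in both batch mode (forward) and incremental
# mode (start_sequence / add_input). Shapes follow the docstrings above;
# the hyperparameter values are arbitrary illustrations, and this assumes
# parakeet plus a fluid-era PaddlePaddle running in dygraph mode.
if __name__ == "__main__":
    with dg.guard():
        layer = Conv1DGLU(
            n_speakers=1,
            speaker_dim=None,
            in_channels=64,
            num_filters=64,  # must equal in_channels when residual=True
            filter_size=3,
            dilation=2,
            causal=True)

        # batch mode: (B, C_in, T) in, (B, C_out, T) out
        x = dg.to_variable(np.random.randn(4, 64, 20).astype("float32"))
        y = layer(x)
        print(y.shape)  # expected: [4, 64, 20]

        # incremental mode: reset the cell's internal buffer, then feed
        # the sequence one step at a time (as in autoregressive decoding)
        layer.start_sequence()
        for t in range(20):
            x_t = dg.to_variable(np.random.randn(4, 64).astype("float32"))
            y_t = layer.add_input(x_t)  # (B, C_out) per step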