# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math

import numpy as np

import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg

from deepvoice3_paddle.weight_norm import Conv2D, Conv2DTranspose


class Conv1D(dg.Layer):
    """
    A 1D convolution block implemented with Conv2D. For simplicity, and to
    ensure the output has the same length as the input, it does not allow
    stride > 1.
    """

    def __init__(self,
                 name_scope,
                 in_channels,
                 num_filters,
                 filter_size=3,
                 dilation=1,
                 groups=None,
                 causal=False,
                 param_attr=None,
                 bias_attr=None,
                 use_cudnn=True,
                 act=None,
                 dtype="float32"):
        super(Conv1D, self).__init__(name_scope, dtype=dtype)

        # A causal convolution pads only on the left (handled by slicing off
        # the right padding in forward), so each output step depends only on
        # current and past inputs; a non-causal one pads symmetrically.
        if causal:
            padding = dilation * (filter_size - 1)
        else:
            padding = (dilation * (filter_size - 1)) // 2

        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.padding = padding
        self.act = act

        self.conv = Conv2D(
            self.full_name(),
            num_filters=num_filters,
            filter_size=(1, filter_size),
            stride=(1, 1),
            dilation=(1, dilation),
            padding=(0, padding),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)

    def forward(self, x):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
                means output channels (num_filters).
        """
        x = self.conv(x)
        if self.filter_size > 1:
            if self.causal:
                # Conv2D pads both sides, so drop the extra right padding to
                # keep the convolution causal and the output length T.
                x = fluid.layers.slice(
                    x, axes=[3], starts=[0], ends=[-self.padding])
            elif self.filter_size % 2 == 0:
                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
        return x

    def start_new_sequence(self):
        self.temp_weight = None
        self.input_buffer = None

    def add_input(self, x):
        """
        Add the input for one time step and compute the output for that time
        step.

        Args:
            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
                input channels, and T = 1.

        Returns:
            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
                means output channels (num_filters), and T = 1.
""" if self.temp_weight is None: self.temp_weight = self._reshaped_weight() window_size = 1 + (self.filter_size - 1) * self.dilation batch_size = x.shape[0] in_channels = x.shape[1] if self.filter_size > 1: if self.input_buffer is None: self.input_buffer = fluid.layers.fill_constant( [batch_size, in_channels, 1, window_size - 1], dtype=x.dtype, value=0.0) else: self.input_buffer = self.input_buffer[:, :, :, 1:] self.input_buffer = fluid.layers.concat( [self.input_buffer, x], axis=3) x = self.input_buffer if self.dilation > 1: if not hasattr(self, "indices"): self.indices = dg.to_variable( np.arange(0, window_size, self.dilation)) tmp = fluid.layers.transpose( self.input_buffer, perm=[3, 1, 2, 0]) tmp = fluid.layers.gather(tmp, index=self.indices) tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0]) x = tmp inputs = fluid.layers.reshape( x, shape=[batch_size, in_channels * 1 * self.filter_size]) out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True) out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1) out = fluid.layers.reshape(out, out.shape + [1, 1]) out = self._helper.append_activation(out, act=self.act) return out def _reshaped_weight(self): """ Get the linearized weight of convolution filter, cause it is by nature a matmul weight. And because the model uses weight norm, compute the weight by weight_v * weight_g to make it faster. Returns: weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size) """ shape = self.conv._filter_param_v.shape matrix_shape = [shape[0], np.prod(shape[1:])] weight_matrix = fluid.layers.reshape( self.conv._filter_param_v, shape=matrix_shape) weight_matrix = fluid.layers.elementwise_mul( fluid.layers.l2_normalize( weight_matrix, axis=1), self.conv._filter_param_g, axis=0) return weight_matrix class Conv1DTranspose(dg.Layer): """ A convolutional transpose 1D block implemented with convolutional transpose 2D. It does not ensure that the output is exactly expanded stride times in time dimension. """ def __init__(self, name_scope, in_channels, num_filters, filter_size, padding=0, stride=1, dilation=1, groups=None, param_attr=None, bias_attr=None, use_cudnn=True, act=None, dtype="float32"): super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype) self.in_channels = in_channels self.num_filters = num_filters self.filter_size = filter_size self.padding = padding self.stride = stride self.dilation = dilation self.groups = groups self.conv_transpose = Conv2DTranspose( self.full_name(), num_filters, filter_size=(1, filter_size), padding=(0, padding), stride=(1, stride), dilation=(1, dilation), groups=groups, param_attr=param_attr, bias_attr=bias_attr, use_cudnn=use_cudnn, act=act, dtype=dtype) def forward(self, x): """ Argss: x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input channels and T_in means the number of time steps of input. Returns: out (Variable): shape(B, C_out, 1, T_out), where C_out means the output channels and T_out means the number of time steps of input. """ return self.conv_transpose(x)