diff --git a/parakeet/modules/customized.py b/parakeet/modules/customized.py
index e0eb65aca5160c7689d9db013f64235b8b0df829..dc2259a388222158fecb4a6ee2da0574ad07ad24 100644
--- a/parakeet/modules/customized.py
+++ b/parakeet/modules/customized.py
@@ -1,70 +1,7 @@
 from paddle import fluid
+import paddle.fluid.layers as F
 import paddle.fluid.dygraph as dg
 
-class Conv1D(dg.Layer):
-    """
-    A convolution 1D block implemented with Conv2D. Form simplicity and
-    ensuring the output has the same length as the input, it does not allow
-    stride > 1.
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 filter_size=3,
-                 padding=0,
-                 dilation=1,
-                 stride=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 data_format='NCT',
-                 dtype="float32"):
-        super(Conv1D, self).__init__(dtype=dtype)
-
-        self.padding = padding
-        self.in_channels = in_channels
-        self.num_filters = out_channels
-        self.filter_size = filter_size
-        self.stride = stride
-        self.dilation = dilation
-        self.padding = padding
-        self.act = act
-        self.data_format = data_format
-
-        self.conv = dg.Conv2D(
-            num_channels=in_channels,
-            num_filters=out_channels,
-            filter_size=(1, filter_size),
-            stride=(1, stride),
-            dilation=(1, dilation),
-            padding=(0, padding),
-            groups=groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            act=act,
-            dtype=dtype)
-
-    def forward(self, x):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
-                input channels.
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
-                output channels (num_filters).
-        """
-        if self.data_format == 'NTC':
-            x = fluid.layers.transpose(x, [0, 2, 1])
-        x = fluid.layers.unsqueeze(x, [2])
-        x = self.conv(x)
-        x = fluid.layers.squeeze(x, [2])
-        if self.data_format == 'NTC':
-            x = fluid.layers.transpose(x, [0, 2, 1])
-        return x
 
 class Pool1D(dg.Layer):
     """
@@ -115,3 +52,156 @@ class Pool1D(dg.Layer):
         if self.data_format == 'NTC':
             x = fluid.layers.transpose(x, [0, 2, 1])
         return x
+
+class Conv1D(dg.Conv2D):
+    """A standard Conv1D layer that uses the (B, C, T) data layout. It inherits Conv2D
+    and internally uses the (B, C, 1, T) layout to compute the 1D convolution. Nothing more.
+    NOTE: we inherit from Conv2D rather than wrapping a Conv2D layer so that this stays
+    a simple layer instead of a composite one, and weight norm can be applied to it directly.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 dtype='float32'):
+        super(Conv1D, self).__init__(num_channels,
+                                     num_filters, (1, filter_size),
+                                     stride=(1, stride),
+                                     padding=(0, padding),
+                                     dilation=(1, dilation),
+                                     groups=groups,
+                                     param_attr=param_attr,
+                                     bias_attr=bias_attr,
+                                     use_cudnn=use_cudnn,
+                                     act=act,
+                                     dtype=dtype)
+
+    def forward(self, x):
+        x = F.unsqueeze(x, [2])
+        x = super(Conv1D, self).forward(x)  # maybe risky here
+        x = F.squeeze(x, [2])
+        return x
+
+
+class Conv1DTranspose(dg.Conv2DTranspose):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 padding=0,
+                 stride=1,
+                 dilation=1,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 dtype='float32'):
+        super(Conv1DTranspose, self).__init__(num_channels,
+                                              num_filters, (1, filter_size),
+                                              output_size=None,
+                                              padding=(0, padding),
+                                              stride=(1, stride),
+                                              dilation=(1, dilation),
+                                              groups=groups,
+                                              param_attr=param_attr,
+                                              bias_attr=bias_attr,
+                                              use_cudnn=use_cudnn,
+                                              act=act,
+                                              dtype=dtype)
+
+    def forward(self, x):
+        x = F.unsqueeze(x, [2])
+        x = super(Conv1DTranspose, self).forward(x)  # maybe risky here
+        x = F.squeeze(x, [2])
+        return x
+
+
+class Conv1DCell(Conv1D):
+    """A causal Conv1D cell. It uses causal padding, i.e. padding (receptive_field - 1, 0).
+    Since Conv2D in dygraph does not support asymmetric padding yet, we pad
+    (receptive_field - 1, receptive_field - 1) on both sides and drop the last
+    receptive_field - 1 steps of the output.
+
+    It acts like an RNN cell. It does not support stride > 1, and it ensures a
+    1-to-1 mapping from input time steps to output time steps.
+    """
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 dilation=1,
+                 causal=False,
+                 groups=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_cudnn=True,
+                 act=None,
+                 dtype='float32'):
+        receptive_field = 1 + dilation * (filter_size - 1)
+        padding = receptive_field - 1 if causal else receptive_field // 2
+        self._receptive_field = receptive_field
+        self.causal = causal
+        super(Conv1DCell, self).__init__(num_channels,
+                                         num_filters,
+                                         filter_size,
+                                         stride=1,
+                                         padding=padding,
+                                         dilation=dilation,
+                                         groups=groups,
+                                         param_attr=param_attr,
+                                         bias_attr=bias_attr,
+                                         use_cudnn=use_cudnn,
+                                         act=act,
+                                         dtype=dtype)
+
+    def forward(self, x):
+        # ensure that output time steps == input time steps
+        time_steps = x.shape[-1]
+        x = super(Conv1DCell, self).forward(x)
+        if x.shape[-1] != time_steps:
+            x = x[:, :, :time_steps]
+        return x
+
+    @property
+    def receptive_field(self):
+        return self._receptive_field
+
+    def start_sequence(self):
+        if not self.causal:
+            raise ValueError(
+                "Only a causal Conv1DCell should use start_sequence")
+        if self.receptive_field == 1:
+            raise ValueError(
+                "Convolution block with receptive field = 1 does not need"
+                " to be implemented as a Conv1DCell. Conv1D suffices")
+        self._buffer = None
+        self._reshaped_weight = F.reshape(self.weight, (self._num_filters, -1))
+
+    def add_input(self, x_t):
+        batch_size, c_in, _ = x_t.shape
+        if self._buffer is None:
+            self._buffer = F.zeros((batch_size, c_in, self.receptive_field),
+                                   dtype=x_t.dtype)
+        self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1)
+        if self._dilation[1] > 1:
+            input = F.strided_slice(self._buffer,
+                                    axes=[2],
+                                    starts=[0],
+                                    ends=[self.receptive_field],
+                                    strides=[self._dilation[1]])
+        else:
+            input = self._buffer
+        input = F.reshape(input, (batch_size, -1))
+        y_t = F.matmul(input, self._reshaped_weight, transpose_y=True)
+        y_t = y_t + self.bias
+        y_t = F.unsqueeze(y_t, [-1])
+        return y_t
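
Reviewer note (not part of the diff): a minimal sketch of how the new Conv1DCell is intended to be driven, once in parallel over a whole utterance and once step by step via start_sequence/add_input. The import path, place, shapes, and tolerance below are illustrative assumptions; with the default act=None and causal=True, the two paths should agree up to floating-point error.

import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
# assumed import path for this module
from parakeet.modules.customized import Conv1DCell

with dg.guard(fluid.CPUPlace()):
    # hypothetical sizes, chosen only for illustration
    cell = Conv1DCell(num_channels=4,
                      num_filters=6,
                      filter_size=3,
                      dilation=2,
                      causal=True)
    x = dg.to_variable(np.random.randn(2, 4, 10).astype("float32"))

    # parallel (training-style) path: one forward pass over all time steps
    y_parallel = cell(x)  # (2, 6, 10)

    # incremental (autoregressive-style) path: one time step at a time
    cell.start_sequence()
    outs = []
    for t in range(10):
        x_t = x[:, :, t:t + 1]            # (2, 4, 1)
        outs.append(cell.add_input(x_t))  # (2, 6, 1)
    y_stepwise = fluid.layers.concat(outs, axis=-1)

    # the two paths should match up to float error when act=None
    print(np.allclose(y_parallel.numpy(), y_stepwise.numpy(), atol=1e-5))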