import numpy as np

from paddle import fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as F
import paddle.fluid.initializer as I

from parakeet.modules.weight_norm import Conv1DCell, Linear


class Conv1DGLU(dg.Layer):
    """
    A Convolution 1D block with GLU activation. It also applys dropout for the 
    input x. It fuses speaker embeddings through a FC activated by softsign. It
    has residual connection from the input x, and scale the output by 
    np.sqrt(0.5).
    """
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size=1,
                 dilation=1,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True):
        super(Conv1DGLU, self).__init__()

        # conv spec
        self.in_channels = in_channels
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation

        # padding
        self.causal = causal

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        self.residual = residual
        if residual:
            assert (
                in_channels == num_filters
            ), "this block uses a residual connection, "\
                "so in_channels should equal num_filters"
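        # init scale: account for std_mul and the dropout keep probability,
        # normalized by the fan-in (filter_size * in_channels)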
        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
        self.conv = Conv1DCell(in_channels,
                               2 * num_filters,
                               filter_size,
                               dilation,
                               causal,
                               param_attr=I.Normal(scale=std))

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
            self.fc = Linear(speaker_dim,
                             num_filters,
                             param_attr=I.Normal(scale=std))

    def forward(self, x, speaker_embed=None):
        """
        Args:
            x (Variable): Shape(B, C_in, T), the input of the Conv1DGLU
                layer, where B means batch_size, C_in means the input
                channels and T means input time steps.
            speaker_embed (Variable): Shape(B, C_sp), speaker embedding,
                where C_sp means the speaker embedding size. Note that when
                using a residual connection, the Conv1DGLU does not change
                the number of channels, so the output channels equal the
                input channels.

        Returns:
            x (Variable): Shape(B, C_out, T), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        residual = x
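        # dropout with "upscale_in_train" scales kept activations by
        # 1 / (1 - p) at training time, so inference needs no rescaling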
        x = F.dropout(x,
                      self.dropout,
                      dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = F.split(x, num_or_sections=2, dim=1)

        if speaker_embed is not None:
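            # fuse speaker embedding: a softsign-activated FC projects it to
            # num_filters channels; elementwise_add with axis=0 broadcasts
            # the (B, C) embedding over the time dimension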
            sp = F.softsign(self.fc(speaker_embed))
            content = F.elementwise_add(content, sp, axis=0)

        # glu
        x = F.sigmoid(gate) * content

        if self.residual:
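            # scale the sum by sqrt(0.5) to keep the activation variance
            # roughly constant across stacked residual blocks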
            x = F.scale(x + residual, np.sqrt(0.5))
        return x

    def start_sequence(self):
        """Prepare the underlying Conv1DCell for incremental (step-by-step)
        generation with add_input."""
        self.conv.start_sequence()

    def add_input(self, x_t, speaker_embed=None):
        """
        Args:
            x_t (Variable): Shape(B, C_in), one time step of the input of
                the Conv1DGLU layer, where B means batch_size and C_in means
                the input channels.
            speaker_embed (Variable): Shape(B, C_sp), speaker embedding,
                where C_sp means the speaker embedding size. Note that when
                using a residual connection, the Conv1DGLU does not change
                the number of channels, so the output channels equal the
                input channels.

        Returns:
            x_t (Variable): Shape(B, C_out), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        residual = x_t
        x_t = F.dropout(x_t,
                        self.dropout,
                        dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            sp = F.softsign(self.fc(speaker_embed))
            content_t = F.elementwise_add(content_t, sp, axis=0)

        # glu
        x_t = F.sigmoid(gate_t) * content_t

        if self.residual:
            x_t = F.scale(x_t + residual, np.sqrt(0.5))
        return x_t
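

# A minimal usage sketch, not part of the original module. Assumptions: the
# PaddlePaddle 1.x dygraph API and the parakeet package are available, and
# shapes follow the docstrings above (so add_input consumes one (B, C_in)
# step per call).
if __name__ == "__main__":
    with dg.guard():
        layer = Conv1DGLU(n_speakers=1,
                          speaker_dim=None,
                          in_channels=64,
                          num_filters=64,
                          filter_size=3,
                          dilation=2,
                          causal=True)

        # parallel (whole-utterance) forward pass
        x = dg.to_variable(np.random.randn(4, 64, 100).astype("float32"))
        y = layer(x)  # -> (4, 64, 100): the residual block keeps channels

        # incremental decoding: feed one time step at a time
        layer.start_sequence()
        x_t = dg.to_variable(np.random.randn(4, 64).astype("float32"))
        y_t = layer.add_input(x_t)  # -> (4, 64)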