modules.py

#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
from paddle import fluid
import paddle.fluid.dygraph as dg

import numpy as np

from . import conv
from . import weight_norm


def FC(name_scope,
       in_features,
       size,
       num_flatten_dims=1,
       relu=False,
       dropout=0.0,
       epsilon=1e-30,
       act=None,
       is_test=False,
       dtype="float32"):
    """
    Build a ``weight_norm.FC`` layer with dropout-aware weight initialization.

    Each weight is drawn from normal(0, std) where
    std = np.sqrt((1 - dropout) / in_feature), multiplied by np.sqrt(2.0)
    when ``relu`` is True. The bias is initialized to zero.

    Args:
        name_scope (str): name scope forwarded to ``weight_norm.FC``.
        in_features (int | iterable of int): input feature size(s). A single
            int is treated as a one-element list; one weight ParamAttr is
            created per entry.
        size (int): forwarded to ``weight_norm.FC``.
        num_flatten_dims (int): forwarded to ``weight_norm.FC``.
        relu (bool): whether to scale the std by np.sqrt(2.0).
        dropout (float): dropout rate used only to compute the std.
        epsilon (float): accepted but not used by this function.
        act (str | None): forwarded to ``weight_norm.FC``.
        is_test (bool): accepted but not used by this function.
        dtype (str): forwarded to ``weight_norm.FC``.

    Returns:
        The layer object created by ``weight_norm.FC``.
    """
    fan_ins = [in_features] if isinstance(in_features, int) else in_features

    # one weight ParamAttr per input, each with its own std
    weight_attrs = []
    for fan_in in fan_ins:
        std = np.sqrt((1 - dropout) / fan_in)
        if relu:
            std = std * np.sqrt(2.0)
        normal_init = fluid.initializer.NormalInitializer(scale=std)
        weight_attrs.append(fluid.ParamAttr(initializer=normal_init))

    zero_init = fluid.initializer.ConstantInitializer(0.0)

    return weight_norm.FC(name_scope,
                          size,
                          num_flatten_dims=num_flatten_dims,
                          param_attr=weight_attrs,
                          bias_attr=fluid.ParamAttr(initializer=zero_init),
                          act=act,
                          dtype=dtype)


def Conv1D(name_scope,
           in_channels,
           num_filters,
           filter_size=3,
           dilation=1,
           groups=None,
           causal=False,
           std_mul=1.0,
           dropout=0.0,
           use_cudnn=True,
           act=None,
           dtype="float32"):
    """
    Build a ``conv.Conv1D`` layer with dropout-aware weight initialization.

    The weight is drawn from normal(0, std) with
    std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels)),
    and the bias is initialized to zero. All remaining arguments are
    forwarded to ``conv.Conv1D`` unchanged.

    Returns:
        The layer object created by ``conv.Conv1D``.
    """
    fan_in = filter_size * in_channels
    std = np.sqrt((std_mul * (1 - dropout)) / fan_in)

    normal_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
    zero_init = fluid.initializer.ConstantInitializer(0.0)

    return conv.Conv1D(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        dilation,
        groups=groups,
        causal=causal,
        param_attr=fluid.ParamAttr(initializer=normal_init),
        bias_attr=fluid.ParamAttr(initializer=zero_init),
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)


def Embedding(name_scope,
              num_embeddings,
              embed_dim,
              is_sparse=False,
              is_distributed=False,
              padding_idx=None,
              std=0.01,
              dtype="float32"):
    """
    Build a ``dg.Embedding`` whose table is initialized from normal(0, std).

    Args:
        name_scope (str): name scope forwarded to ``dg.Embedding``.
        num_embeddings (int): number of rows of the embedding table.
        embed_dim (int): number of columns of the embedding table.
        is_sparse (bool): forwarded to ``dg.Embedding``.
        is_distributed (bool): forwarded to ``dg.Embedding``.
        padding_idx (int | None): forwarded to ``dg.Embedding``.
        std (float): scale of the normal initializer.
        dtype (str): forwarded to ``dg.Embedding``.

    Returns:
        The ``dg.Embedding`` layer.
    """
    # param attrs
    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
        scale=std))
    # is_sparse / is_distributed used to be accepted and silently dropped;
    # forward them so the caller's choice actually takes effect.
    layer = dg.Embedding(
        name_scope, (num_embeddings, embed_dim),
        is_sparse=is_sparse,
        is_distributed=is_distributed,
        padding_idx=padding_idx,
        param_attr=weight_attr,
        dtype=dtype)
    return layer


class Conv1DGLU(dg.Layer):
    """
    A 1D convolution block with GLU activation.

    Dropout is applied to the input, the convolution produces
    2 * num_filters channels which are split into a content half and a gate
    half, and the output is sigmoid(gate) * content. When a speaker embedding
    is given it is projected by a softsign-activated 1x1 Conv1D and added to
    the content half. With ``residual=True`` the input is added back and the
    sum is scaled by np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 std_mul=4.0,
                 dropout=0.0,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)

        # the residual sum needs matching channel counts
        if residual:
            assert in_channels == num_filters, (
                "this block uses residual connection"
                "the input_channes should equals num_filters")

        # speaker spec
        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim

        # conv spec
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # weight init and dropout
        self.std_mul = std_mul
        self.dropout = dropout

        # twice the filters: one half is the content, the other the gate
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            std_mul=std_mul,
            dropout=dropout,
            dtype=dtype)

        # the speaker projection only exists in the multi-speaker case
        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            self.fc = Conv1D(
                self.full_name(),
                speaker_dim,
                num_filters,
                filter_size=1,
                dilation=1,
                causal=False,
                act="softsign",
                dtype=dtype)

    def _fuse_and_gate(self, conv_out, shortcut, speaker_embed):
        """Split conv_out into content/gate, fuse the speaker embedding,
        apply the GLU and, if enabled, the scaled residual connection."""
        content, gate = fluid.layers.split(
            conv_out, num_or_sections=2, dim=1)

        if speaker_embed is not None:
            content = content + self.fc(speaker_embed)

        # glu
        out = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), content)

        if self.residual:
            out = fluid.layers.scale(out + shortcut, np.sqrt(0.5))
        return out

    def forward(self, x, speaker_embed_bc1t=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
                layer, where B means batch_size, C_in means the input channels
                T means input time steps.
            speaker_embed_bc1t (Variable): Shape(B, C_sp, 1, T), expanded
                speaker embed, where C_sp means speaker embedding size. Note
                that when using residual connection, the Conv1DGLU does not
                change the number of channels, so out channels equals input
                channels.

        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
                C_out means the output channels of Conv1DGLU.
        """
        dropped = fluid.layers.dropout(x, self.dropout)
        return self._fuse_and_gate(self.conv(dropped), x, speaker_embed_bc1t)

    def add_input(self, x, speaker_embed_bc11=None):
        """
        Step-wise counterpart of ``forward``: feeds one step of input to the
        convolution through ``self.conv.add_input`` and returns the step
        output.

        Inputs:
        x: shape(B, num_filters, 1, time_steps)
        speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)

        Outputs:
        out: shape(B, num_filters, 1, time_steps), where time_steps = 1
        """
        dropped = fluid.layers.dropout(x, self.dropout)
        step_out = self.conv.add_input(dropped)
        return self._fuse_and_gate(step_out, x, speaker_embed_bc11)


def Conv1DTranspose(name_scope,
                    in_channels,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    groups=None,
                    std_mul=1.0,
                    dropout=0.0,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    """
    Build a ``conv.Conv1DTranspose`` layer with dropout-aware initialization.

    The weight is drawn from a normal distribution with
    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size)),
    and the bias is initialized to zero. All remaining arguments are
    forwarded to ``conv.Conv1DTranspose`` unchanged.

    Returns:
        The layer object created by ``conv.Conv1DTranspose``.
    """
    fan_in = in_channels * filter_size
    variance = std_mul * (1 - dropout) / fan_in
    normal_init = fluid.initializer.NormalInitializer(scale=np.sqrt(variance))
    zero_init = fluid.initializer.ConstantInitializer(0.0)

    return conv.Conv1DTranspose(
        name_scope,
        in_channels,
        num_filters,
        filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        groups=groups,
        param_attr=fluid.ParamAttr(initializer=normal_init),
        bias_attr=fluid.ParamAttr(initializer=zero_init),
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)


def compute_position_embedding(rad):
    """
    Turn a table of angles into a sinusoidal position embedding table.

    Rows with an even index are replaced by their sine, rows with an odd
    index by their cosine, and the result is transposed.

    Args:
        rad (Variable): transposed angles, shape (embed_dim, n_vocab).

    Returns:
        Variable: the embedding table, shape (n_vocab, embed_dim).
    """
    # unpacking two values also requires rad to be 2-D
    embed_dim, _ = rad.shape

    sin_rows = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
    cos_rows = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))

    sin_values = fluid.layers.sin(fluid.layers.gather(rad, sin_rows))
    cos_values = fluid.layers.cos(fluid.layers.gather(rad, cos_rows))

    # write the sines first, then the cosines on top of that result
    table = fluid.layers.scatter(rad, sin_rows, sin_values)
    table = fluid.layers.scatter(table, cos_rows, cos_values)
    return fluid.layers.transpose(table, perm=[1, 0])


def position_encoding_init(n_position,
                           d_pos_vec,
                           position_rate=1.0,
                           sinusoidal=True):
    """
    Init the sinusoid position encoding table.

    Entry (pos, i) of the raw table is
    position_rate * pos / 10000 ** (2 * (i // 2) / d_pos_vec).
    Row 0 is kept as a zero vector for the padding position.

    Args:
        n_position (int): number of positions (rows of the table).
        d_pos_vec (int): size of each position vector (columns of the table).
        position_rate (float): multiplier applied to every position index.
        sinusoidal (bool): if True, apply sin to the even columns and cos to
            the odd columns of every row except row 0; if False, return the
            raw angles.

    Returns:
        np.ndarray: float64 array of shape (n_position, d_pos_vec).
    """
    positions = np.arange(n_position, dtype=np.float64)
    dims = np.arange(d_pos_vec)

    # 2.0 forces true division even where "/" on ints would floor.
    exponents = 2.0 * (dims // 2) / d_pos_vec
    timescales = np.power(10000.0, exponents)

    # Broadcasting gives a (n_position, d_pos_vec) table and keeps the result
    # 2-D for n_position == 0, where the list-based construction produced a
    # 1-D array that could not be sliced below.
    position_enc = (position_rate * positions)[:, np.newaxis] \
        / timescales[np.newaxis, :]

    # keep idx 0 for padding token position encoding zero vector
    position_enc[:1] = 0.0

    if sinusoidal:
        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1

    return position_enc


class PositionEmbedding(dg.Layer):
    """
    Position embedding whose position rate can be changed at call time.

    The wrapped ``dg.Embedding`` weight stores the raw angles produced by
    ``position_encoding_init(..., sinusoidal=False)``. On every forward call
    the angles are optionally scaled by a speaker position rate, turned into
    a sin/cos table by ``compute_position_embedding`` and looked up with a
    ``lookup_table`` op.
    """

    def __init__(self,
                 name_scope,
                 n_position,
                 d_pos_vec,
                 position_rate=1.0,
                 is_sparse=False,
                 is_distributed=False,
                 param_attr=None,
                 max_norm=None,
                 padding_idx=None,
                 dtype="float32"):
        """
        Args:
            name_scope (str): name scope of the layer.
            n_position (int): number of positions in the table.
            d_pos_vec (int): size of each position vector.
            position_rate (float): rate baked into the stored angles.
            is_sparse (bool): forwarded to ``dg.Embedding`` and to the
                ``lookup_table`` op.
            is_distributed (bool): forwarded to ``dg.Embedding`` and to the
                ``lookup_table`` op.
            param_attr (ParamAttr | None): forwarded to ``dg.Embedding``.
            max_norm: stored on the layer; not used by the code in this class.
            padding_idx (int | None): padding index for the ``lookup_table``
                op. None maps to -1 and a negative value is offset by
                ``n_position``. It is not passed to the wrapped
                ``dg.Embedding``.
            dtype (str): data type of the weight and of the output.
        """
        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
        self.embed = dg.Embedding(
            self.full_name(),
            size=(n_position, d_pos_vec),
            is_sparse=is_sparse,
            is_distributed=is_distributed,
            padding_idx=None,
            param_attr=param_attr,
            dtype=dtype)
        # store raw angles; sin / cos are applied in forward
        self.set_weight(
            position_encoding_init(
                n_position,
                d_pos_vec,
                position_rate=position_rate,
                sinusoidal=False).astype(dtype))

        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        if padding_idx is None:
            self._padding_idx = -1
        elif padding_idx >= 0:
            self._padding_idx = padding_idx
        else:
            self._padding_idx = n_position + padding_idx
        self._position_rate = position_rate
        self._max_norm = max_norm
        self._dtype = dtype

    def set_weight(self, array):
        """Overwrite the embedding weight with ``array``, which must have
        the same shape as the weight."""
        assert self.embed._w.shape == list(array.shape), "shape does not match"
        self.embed._w._ivar.value().get_tensor().set(
            array, fluid.framework._current_expected_place())

    def _lookup(self, indices, weight, padding_idx):
        """Append a ``lookup_table`` op reading ``indices`` from ``weight``
        and return its output variable."""
        out = self._helper.create_variable_for_type_inference(self._dtype)
        self._helper.append_op(
            type="lookup_table",
            inputs={"Ids": indices,
                    "W": weight},
            outputs={"Out": out},
            attrs={
                "is_sparse": self._is_sparse,
                "is_distributed": self._is_distributed,
                "remote_prefetch": self._remote_prefetch,
                "padding_idx": padding_idx,  # special value for lookup table op
            })
        return out

    def forward(self, indices, speaker_position_rate=None):
        """
        Args:
            indices (Variable): Shape (B, T, 1), dtype: int64, position
                indices, where B means the batch size, T means the time steps.
            speaker_position_rate (Variable | float, optional), position
                rate. It can be a float point number or a Variable with
                shape (1,) or (1, 1), then this speaker_position_rate is used
                for every example. It can also be a Variable with shape
                (B, 1), which contains a speaker position rate for each
                speaker.
        Returns:
            out (Variable): position embedding, expected to have shape
                (B, T, C_pos), where C_pos means position embedding size.

        Raises:
            ValueError: if speaker_position_rate has none of the supported
                shapes.
        """
        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])

        # no speaker rate: use the stored angles as they are
        if speaker_position_rate is None:
            weight = compute_position_embedding(rad)
            return self._lookup(indices, weight, self._padding_idx)

        # one python scalar shared by the whole batch
        if np.isscalar(speaker_position_rate):
            scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
            weight = compute_position_embedding(scaled_rad)
            return self._lookup(indices, weight, self._padding_idx)

        # one Variable rate shared by the whole batch; shape [1] is accepted
        # as well as [1, 1] since the docstring allows a shape (1,) Variable
        if (isinstance(speaker_position_rate, fluid.framework.Variable) and
                speaker_position_rate.shape in ([1], [1, 1])):
            if len(speaker_position_rate.shape) == 1:
                rate = speaker_position_rate
            else:
                rate = speaker_position_rate[0]  # rate has shape [1]
            scaled_rad = fluid.layers.elementwise_mul(rad, rate)
            weight = compute_position_embedding(scaled_rad)
            return self._lookup(indices, weight, self._padding_idx)

        # one rate per example: build a separate table for every example
        if np.prod(speaker_position_rate.shape) > 1:
            batch_size = indices.shape[0]
            assert speaker_position_rate.shape == [batch_size, 1]
            outputs = []
            for i in range(batch_size):
                rate = speaker_position_rate[i]  # rate has shape [1]
                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
                weight = compute_position_embedding(scaled_rad)
                # NOTE(review): this branch always used padding_idx=-1 rather
                # than self._padding_idx; kept as is, confirm it is intended.
                outputs.append(self._lookup(indices[i], weight, -1))
            return fluid.layers.stack(outputs)

        raise ValueError(
            "speaker_position_rate must be a scalar, a Variable with shape "
            "[1] or [1, 1], or a Variable with shape [batch_size, 1]")


class Conv1D_GU(dg.Layer):
    """
    A 1D convolution block with a gated unit and a skip connection.

    The convolution produces 2 * num_filters channels. An optional
    conditioner is projected by a 1x1 Conv1D to the same number of channels
    and added. The sum is split into a content half and a gate half and the
    output is sigmoid(gate) * tanh(content). That output is accumulated into
    the skip connection and, with ``residual=True``, added back to the input;
    both sums are scaled by np.sqrt(0.5).
    """

    def __init__(self,
                 name_scope,
                 conditioner_dim,
                 in_channels,
                 num_filters,
                 filter_size,
                 dilation,
                 causal=False,
                 residual=True,
                 dtype="float32"):
        super(Conv1D_GU, self).__init__(name_scope, dtype=dtype)

        # the residual sum needs matching channel counts
        if residual:
            assert in_channels == num_filters, (
                "this block uses residual connection"
                "the input_channels should equals num_filters")

        self.conditioner_dim = conditioner_dim
        self.in_channels = in_channels
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.dilation = dilation
        self.causal = causal
        self.residual = residual

        # twice the filters: one half is the content, the other the gate
        self.conv = Conv1D(
            self.full_name(),
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal=causal,
            dtype=dtype)

        # 1x1 projection of the conditioner onto the conv output channels
        self.fc = Conv1D(
            self.full_name(),
            conditioner_dim,
            2 * num_filters,
            filter_size=1,
            dilation=1,
            causal=False,
            dtype=dtype)

    def _gated_unit(self, hidden, shortcut, skip, conditioner):
        """Add the conditioner bias, apply the gated unit and update the
        skip and residual connections."""
        if conditioner is not None:
            hidden += self.fc(conditioner)

        content, gate = fluid.layers.split(hidden, num_or_sections=2, dim=1)

        # Gated Unit.
        gated = fluid.layers.elementwise_mul(
            fluid.layers.sigmoid(gate), fluid.layers.tanh(content))

        # the first block starts the skip path, later blocks accumulate
        if skip is None:
            skip = gated
        else:
            skip = fluid.layers.scale(skip + gated, np.sqrt(0.5))

        out = gated
        if self.residual:
            out = fluid.layers.scale(shortcut + gated, np.sqrt(0.5))

        return out, skip

    def forward(self, x, skip=None, conditioner=None):
        """
        Args:
            x (Variable): Shape(B, C_in, 1, T), the input of Conv1D_GU
                layer, where B means batch_size, C_in means the input channels
                T means input time steps.
            skip (Variable): Shape(B, C_in, 1, T), skip connection.
            conditioner (Variable): Shape(B, C_con, 1, T), expanded mel
                conditioner, where C_con is conditioner hidden dim which
                equals the num of mel bands. Note that when using residual
                connection, the Conv1D_GU does not change the number of
                channels, so out channels equals input channels.
        Returns:
            x (Variable): Shape(B, C_out, 1, T), the output of Conv1D_GU, where
                C_out means the output channels of Conv1D_GU.
            skip (Variable): Shape(B, C_out, 1, T), skip connection.
        """
        return self._gated_unit(self.conv(x), x, skip, conditioner)

    def add_input(self, x, skip=None, conditioner=None):
        """
        Step-wise counterpart of ``forward``: feeds one step of input to the
        convolution through ``self.conv.add_input`` and returns the step
        output.

        Inputs:
            x: shape(B, num_filters, 1, time_steps)
            skip: shape(B, num_filters, 1, time_steps), skip connection
            conditioner: shape(B, conditioner_dim, 1, time_steps)
        Outputs:
            x: shape(B, num_filters, 1, time_steps), where time_steps = 1
            skip: skip connection, same shape as x
        """
        return self._gated_unit(self.conv.add_input(x), x, skip, conditioner)


def Conv2DTranspose(name_scope,
                    num_filters,
                    filter_size,
                    padding=0,
                    stride=1,
                    dilation=1,
                    use_cudnn=True,
                    act=None,
                    dtype="float32"):
    """
    Build a ``weight_norm.Conv2DTranspose`` layer with a constant weight.

    Every weight element is initialized to
    1.0 / (filter_size[0] * filter_size[1]), so ``filter_size`` must be
    indexable with at least two entries. No bias attribute is passed. All
    remaining arguments are forwarded to ``weight_norm.Conv2DTranspose``
    unchanged.

    Returns:
        The layer object created by ``weight_norm.Conv2DTranspose``.
    """
    kernel_area = filter_size[0] * filter_size[1]
    constant_init = fluid.initializer.ConstantInitializer(1.0 / kernel_area)

    return weight_norm.Conv2DTranspose(
        name_scope,
        num_filters,
        filter_size=filter_size,
        padding=padding,
        stride=stride,
        dilation=dilation,
        param_attr=fluid.ParamAttr(initializer=constant_init),
        use_cudnn=use_cudnn,
        act=act,
        dtype=dtype)