encoder.py
import numpy as np
from collections import namedtuple

import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg

from parakeet.modules.weight_norm import Conv1D, Linear
from parakeet.models.deepvoice3.conv1dglu import Conv1DGLU

ConvSpec = namedtuple("ConvSpec", ["out_channels", "filter_size", "dilation"])
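# Each ConvSpec describes one encoder convolution block: its output channels,
# kernel width (filter_size) and dilation.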


class Encoder(dg.Layer):
    def __init__(self,
                 n_vocab,
                 embed_dim,
                 n_speakers,
                 speaker_dim,
                 padding_idx=None,
                 embedding_weight_std=0.1,
                 convolutions=(ConvSpec(64, 5, 1), ) * 7,
                 max_positions=512,
                 dropout=0.):
        super(Encoder, self).__init__()

        self.embedding_weight_std = embedding_weight_std
        self.embed = dg.Embedding(
            (n_vocab, embed_dim),
            padding_idx=padding_idx,
            param_attr=I.Normal(scale=embedding_weight_std))

        self.dropout = dropout
        if n_speakers > 1:
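            # Two projections map the speaker embedding into the text-embedding
            # space; forward() adds them (through a softsign) to the encoder
            # input and output respectively. The init std shrinks with the
            # dropout keep-probability (1 - dropout).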
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(speaker_dim,
                                   embed_dim,
                                   param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(speaker_dim,
                                   embed_dim,
                                   param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        # Convolution stack. std_mul feeds each layer's initializer scale,
        # sqrt(std_mul * (1 - dropout) / in_channels); it is raised after each
        # nonlinearity (2.0 after ReLU, 4.0 after GLU) so that activations
        # stay roughly unit-variance at initialization.
        self.convolutions = dg.LayerList()
        in_channels = embed_dim
        std_mul = 1.0
        for (out_channels, filter_size, dilation) in convolutions:
            # 1x1 convolution + ReLU to match the channel count when it changes
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(in_channels,
                           out_channels,
                           1,
                           act="relu",
                           param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0

            self.convolutions.append(
                Conv1DGLU(n_speakers,
                          speaker_dim,
                          in_channels,
                          out_channels,
                          filter_size,
                          dilation,
                          std_mul,
                          dropout,
                          causal=False,
                          residual=True))
            in_channels = out_channels
            std_mul = 4.0

        # final 1x1 convolution projects back to embed_dim; the init std again
        # compensates for the accumulated gain (std_mul) and dropout
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))

    def forward(self, x, speaker_embed=None):
        """
        Encode text sequence.
        
        Args:
            x (Variable): Shape(B, T_enc), dtype: int64. Ihe input text
                indices. T_enc means the timesteps of decoder input x.
            speaker_embed (Variable, optional): Shape(batch_size, speaker_dim),
                dtype: float32. Speaker embeddings. This arg is not None only
                when the model is a multispeaker model.

        Returns:
            keys (Variable), Shape(B, T_enc, C_emb), the encoded
                representation for keys, where C_emb menas the text embedding
                size.
            values (Variable), Shape(B, T_enc, C_emb), the encoded
                representation for values.
        """
        x = self.embed(x)
        x = F.dropout(x,
                      self.dropout,
                      dropout_implementation="upscale_in_train")
        x = F.transpose(x, [0, 2, 1])

        if self.n_speakers > 1 and speaker_embed is not None:
            speaker_embed = F.dropout(
                speaker_embed,
                self.dropout,
                dropout_implementation="upscale_in_train")
            x = F.elementwise_add(x,
                                  F.softsign(self.sp_proj1(speaker_embed)),
                                  axis=0)

        input_embed = x
        for layer in self.convolutions:
            if isinstance(layer, Conv1DGLU):
                x = layer(x, speaker_embed)
            else:
                # layer is a weight-normalized Conv1D with a 1x1 filter,
                # used for channel projection
                x = layer(x)

        if self.n_speakers > 1 and speaker_embed is not None:
            x = F.elementwise_add(x,
                                  F.softsign(self.sp_proj2(speaker_embed)),
                                  axis=0)

        keys = x  # (B, C, T)
        # residual connection scaled by sqrt(0.5) to keep the variance roughly
        # constant, as in Deep Voice 3
        values = F.scale(input_embed + x, scale=np.sqrt(0.5))
        # back to the (B, T, C) layout
        keys = F.transpose(keys, [0, 2, 1])
        values = F.transpose(values, [0, 2, 1])
        return keys, values
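

# A minimal usage sketch (an assumption for illustration, not part of the
# model code): builds a single-speaker encoder under fluid's dygraph guard
# and runs a batch of token ids through it. All hyperparameters here are
# illustrative, not the values of any released model.
if __name__ == "__main__":
    with dg.guard():
        encoder = Encoder(n_vocab=128,
                          embed_dim=256,
                          n_speakers=1,
                          speaker_dim=16)
        # a batch of 4 sequences, 50 token ids each
        text = dg.to_variable(
            np.random.randint(0, 128, size=(4, 50)).astype("int64"))
        keys, values = encoder(text)
        print(keys.shape, values.shape)  # both (4, 50, 256)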