# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from itertools import chain

import paddle.fluid.layers as F
import paddle.fluid.initializer as I
import paddle.fluid.dygraph as dg

from parakeet.modules.weight_norm import Conv1D, Conv1DTranspose, Conv2D, Conv2DTranspose, Linear
from parakeet.models.deepvoice3.conv1dglu import Conv1DGLU
from parakeet.models.deepvoice3.encoder import ConvSpec


def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    """Return a list of Layers that upsamples the input by 4 times in time dimension.

    Args:
        n_speakers (int): number of speakers of the Conv1DGLU layers used.
        speaker_dim (int): speaker embedding size of the Conv1DGLU layers used.
        target_channels (int): channels of the input and the output. (The list of layers does not change the number of channels.)
        dropout (float): dropout probability.

    Returns:
        List[Layer]: upsampling layers.
    """

    # helper: a stride-2 transposed conv that doubles the time dimension
    def stretch2x(init_scale):
        return Conv1DTranspose(
            target_channels,
            target_channels,
            2,
            stride=2,
            param_attr=I.Normal(scale=init_scale))

    # helper: a gated conv block that keeps time and channel dimensions
    def glu(dilation, std_mul):
        return Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=dilation,
            std_mul=std_mul,
            dropout=dropout)

    # two (transpose-conv, GLU, GLU) stages, each doubling time => 4x total
    return [
        stretch2x(np.sqrt(1 / (2 * target_channels))),
        glu(1, 1.),
        glu(3, 4.),
        stretch2x(np.sqrt(4. / (2 * target_channels))),
        glu(1, 1.),
        glu(3, 4.),
    ]


def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    """Return a list of Layers that upsamples the input by 2 times in time dimension.

    Args:
        n_speakers (int): number of speakers of the Conv1DGLU layers used.
        speaker_dim (int): speaker embedding size of the Conv1DGLU layers used.
        target_channels (int): channels of the input and the output. (The list of layers does not change the number of channels.)
        dropout (float): dropout probability.

    Returns:
        List[Layer]: upsampling layers.
    """
    # stride-2 transposed conv doubles the time dimension
    stretch = Conv1DTranspose(
        target_channels,
        target_channels,
        2,
        stride=2,
        param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels))))
    # two gated conv blocks refine the upsampled sequence
    refine_1 = Conv1DGLU(
        n_speakers,
        speaker_dim,
        target_channels,
        target_channels,
        3,
        dilation=1,
        std_mul=1.,
        dropout=dropout)
    refine_2 = Conv1DGLU(
        n_speakers,
        speaker_dim,
        target_channels,
        target_channels,
        3,
        dilation=3,
        std_mul=4.,
        dropout=dropout)
    return [stretch, refine_1, refine_2]


def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    """Return a list of Layers for the 1x case (the time dimension is left unchanged).

    Args:
        n_speakers (int): number of speakers of the Conv1DGLU layers used.
        speaker_dim (int): speaker embedding size of the Conv1DGLU layers used.
        target_channels (int): channels of the input and the output. (The list of layers does not change the number of channels.)
        dropout (float): dropout probability.

    Returns:
        List[Layer]: upsampling layers.
    """
    # a single gated conv block; no transposed conv, so time length is preserved
    return [
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]


class Converter(dg.Layer):
    def __init__(self,
                 n_speakers,
                 speaker_dim,
                 in_channels,
                 linear_dim,
                 convolutions=(ConvSpec(256, 5, 1), ) * 4,
                 time_upsampling=1,
                 dropout=0.0):
        """Vocoder that transforms mel spectrogram (or encoder hidden states) to waveform.

        Args:
            n_speakers (int): number of speakers.
            speaker_dim (int): speaker embedding size.
            in_channels (int): channels of the input.
            linear_dim (int): channels of the linear spectrogram.
            convolutions (Iterable[ConvSpec], optional): specifications of the internal convolutional layers. ConvSpec is a namedtuple of (output_channels, filter_size, dilation). Defaults to (ConvSpec(256, 5, 1), )*4.
            time_upsampling (int, optional): time upsampling factor of the converter, possible options are {1, 2, 4}. Note that this should equal the downsample factor of the mel spectrogram. Defaults to 1.
            dropout (float, optional): dropout probability. Defaults to 0.0.
        """
        super(Converter, self).__init__()

        self.n_speakers = n_speakers
        self.speaker_dim = speaker_dim
        self.in_channels = in_channels
        self.linear_dim = linear_dim
        # CAUTION: this should equal the downsampling steps coefficient
        self.time_upsampling = time_upsampling
        self.dropout = dropout

        target_channels = convolutions[0].out_channels

        # 1x1 conv projection from input channels to the working channel count
        self.first_conv_proj = Conv1D(
            in_channels,
            target_channels,
            1,
            param_attr=I.Normal(scale=np.sqrt(1 / in_channels)))

        # Idea from nyanko: choose the upsampling stack by the time factor.
        block_builders = {
            4: upsampling_4x_blocks,
            2: upsampling_2x_blocks,
            1: upsampling_1x_blocks,
        }
        if time_upsampling not in block_builders:
            raise ValueError(
                "Upsampling factors other than {1, 2, 4} are Not supported.")
        self.upsampling_convolutions = dg.LayerList(block_builders[
            time_upsampling](n_speakers, speaker_dim, target_channels,
                             dropout))

        # post conv layers
        std_mul = 4.0
        in_channels = target_channels
        self.convolutions = dg.LayerList()
        for (out_channels, filter_size, dilation) in convolutions:
            if in_channels != out_channels:
                # CAUTION: relu — 1x1 projection used only when channels change
                self.convolutions.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(
                            scale=np.sqrt(std_mul / in_channels))))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation=dilation,
                    std_mul=std_mul,
                    dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0

        # final conv proj, channel transformed to linear dim
        # CAUTION: sigmoid output activation
        self.last_conv_proj = Conv1D(
            in_channels,
            linear_dim,
            1,
            act="sigmoid",
            param_attr=I.Normal(
                scale=np.sqrt(std_mul * (1 - dropout) / in_channels)))

    def forward(self, x, speaker_embed=None):
        """Convert mel spectrogram or decoder hidden states to linear spectrogram.

        Args:
            x (Variable): shape(B, T_mel, C_in), dtype float, converter inputs, where C_in means the input channel for the converter. Note that it can be either C_mel (channel of mel spectrogram) or C_dec // r (when decoder states are used as the converter input).
            speaker_embed (Variable, optional): shape(B, C_sp), dtype float, speaker embedding, where C_sp means the speaker embedding size.

        Returns:
            out (Variable): shape(B, T_lin, C_lin), the output linear spectrogram, where C_lin means the channel of linear spectrogram and T_lin means the length (time steps) of linear spectrogram. T_lin = time_upsampling * T_mel.
        """
        # to channel-first layout for the conv layers
        x = F.transpose(x, [0, 2, 1])
        x = self.first_conv_proj(x)

        if speaker_embed is not None:
            speaker_embed = F.dropout(
                speaker_embed,
                self.dropout,
                dropout_implementation="upscale_in_train")

        # only Conv1DGLU layers consume the speaker embedding
        for layer in chain(self.upsampling_convolutions, self.convolutions):
            x = layer(x, speaker_embed) if isinstance(layer,
                                                      Conv1DGLU) else layer(x)

        out = self.last_conv_proj(x)
        # back to time-first layout
        return F.transpose(out, [0, 2, 1])