# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools

import numpy as np
import paddle.fluid.dygraph as dg
from paddle import fluid
from parakeet.modules import conv, modules


def get_padding(filter_size, stride, padding_type='same'):
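    """Compute 'same' padding for each dimension as (filter_size - stride) // 2."""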
    if padding_type == 'same':
        padding = [(x - y) // 2 for x, y in zip(filter_size, stride)]
    else:
        raise ValueError("Only support same padding")
    return padding


def extract_slices(x, audio_starts, audio_length, rank):
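    """Slice an audio_length-step window from each example in the batch,
    starting at that example's offset in audio_starts, and stack the results
    back into a single tensor. The rank argument is currently unused here.
    """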
    slices = []
    for i in range(x.shape[0]):
        start = audio_starts.numpy()[i]
        end = start + audio_length
        slice = fluid.layers.slice(
            x, axes=[0, 1], starts=[i, start], ends=[i + 1, end])
        slices.append(fluid.layers.squeeze(slice, [0]))

    x = fluid.layers.stack(slices, axis=0)

    return x


class Conditioner(dg.Layer):
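    """Upsamples mel-spectrogram frames to the audio sample rate with a stack
    of transposed convolutions followed by leaky ReLU, so that each audio
    sample has a corresponding conditioning vector. The product of the
    upsample factors must equal the hop size (fft_window_shift).
    """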
    def __init__(self, name_scope, config):
        super(Conditioner, self).__init__(name_scope)
        upsample_factors = config.conditioner.upsample_factors
        filter_sizes = config.conditioner.filter_sizes
        assert np.prod(upsample_factors) == config.fft_window_shift

        self.deconvs = []
        for i, up_scale in enumerate(upsample_factors):
            stride = (up_scale, 1)
            padding = get_padding(filter_sizes[i], stride)
            self.deconvs.append(
                modules.Conv2DTranspose(
                    self.full_name(),
                    num_filters=1,
                    filter_size=filter_sizes[i],
                    padding=padding,
                    stride=stride))

        # Register python list as parameters.
        for i, layer in enumerate(self.deconvs):
            self.add_sublayer("conv_transpose_{}".format(i), layer)

    def forward(self, x):
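        """Upsample the mel conditioner along the frame (time) axis by the
        product of the upsample factors; input and output are both shaped
        [batch, time, mel_bands].
        """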
        x = fluid.layers.unsqueeze(x, 1)
        for layer in self.deconvs:
            x = fluid.layers.leaky_relu(layer(x), alpha=0.4)

        return fluid.layers.squeeze(x, [1])


class WaveNetModule(dg.Layer):
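    """WaveNet vocoder module: a Conditioner that upsamples mel spectrograms,
    a stack of dilated causal gated convolutions, and fully connected output
    layers that parameterize either a softmax over quantized audio channels
    ("softmax") or a mixture of Gaussians ("mix-gaussian-pdf").
    """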
    def __init__(self, name_scope, config, rank):
        super(WaveNetModule, self).__init__(name_scope)

        self.rank = rank
        self.conditioner = Conditioner(self.full_name(), config)
        self.dilations = list(
            itertools.islice(
                itertools.cycle(config.dilation_block), config.layers))
        self.context_size = sum(self.dilations) + 1
        self.log_scale_min = config.log_scale_min
        self.config = config

        print("dilations", self.dilations)
        print("context_size", self.context_size)

        if config.loss_type == "softmax":
            self.embedding_fc = modules.Embedding(
                self.full_name(),
                num_embeddings=config.num_channels,
                embed_dim=config.residual_channels,
                std=0.1)
        elif config.loss_type == "mix-gaussian-pdf":
            self.embedding_fc = modules.FC(self.full_name(),
                                           in_features=1,
                                           size=config.residual_channels,
                                           num_flatten_dims=2,
                                           relu=False)
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        self.dilated_causal_convs = []
        for dilation in self.dilations:
            self.dilated_causal_convs.append(
                modules.Conv1D_GU(
                    self.full_name(),
                    conditioner_dim=config.mel_bands,
                    in_channels=config.residual_channels,
                    num_filters=config.residual_channels,
                    filter_size=config.kernel_width,
                    dilation=dilation,
                    causal=True))

        for i, layer in enumerate(self.dilated_causal_convs):
            self.add_sublayer("dilated_causal_conv_{}".format(i), layer)

        self.fc1 = modules.FC(self.full_name(),
                              in_features=config.residual_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        self.fc2 = modules.FC(self.full_name(),
                              in_features=config.skip_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        if config.loss_type == "softmax":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=config.num_channels,
                                  num_flatten_dims=2,
                                  relu=False)
        elif config.loss_type == "mix-gaussian-pdf":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=3 * config.num_mixtures,
                                  num_flatten_dims=2,
                                  relu=False)
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

    def sample_softmax(self, mix_parameters):
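        """Draw one sample per time step from the categorical distribution
        over num_channels quantization bins, then map the sampled bin index
        back to a float in (-1, 1).
        """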
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)

        # quantized: [batch * length]
        quantized = fluid.layers.cast(
            fluid.layers.sampling_id(mix_param_2d), dtype="float32")
        samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0

        # samples: [batch * length]
        return samples

    def sample_mix_gaussian(self, mix_parameters):
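        """Sample from the mixture of Gaussians: pick a component index from
        the softmaxed mixture weights, then draw from the chosen component's
        N(mu, s) and clip the result to [-1, 1].
        """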
        # mix_parameters reshape from [bs, len, 3 * num_mixtures]
        # to [bs * len, 3 * num_mixtures].
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        K = hidden // 3

        # Unpack the parameters of the mixture of gaussian.
        logits_pi = mix_param_2d[:, 0:K]
        mu = mix_param_2d[:, K:2 * K]
        log_s = mix_param_2d[:, 2 * K:3 * K]
        s = fluid.layers.exp(log_s)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        comp_samples = fluid.layers.sampling_id(pi)

        row_idx = dg.to_variable(np.arange(batch * length))
        comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)

        mu_comp = fluid.layers.gather_nd(mu, comp_samples)
        s_comp = fluid.layers.gather_nd(s, comp_samples)

        # N(0, 1) normal sample.
        u = fluid.layers.gaussian_random(shape=[batch * length])
        samples = mu_comp + u * s_comp
        samples = fluid.layers.clip(samples, min=-1.0, max=1.0)

        return samples

    def softmax_loss(self, targets, mix_parameters):
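        """Cross-entropy loss between the predicted logits and the targets
        quantized into num_channels bins; the first context_size steps are
        excluded since they do not have a full receptive field.
        """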
        targets = targets[:, self.context_size:]
        mix_parameters = mix_parameters[:, self.context_size:, :]

        # Quantized audios to integral values with range [0, num_channels)
        num_channels = self.config.num_channels
        targets = fluid.layers.clip(targets, min=-1.0, max=0.99999)
        quantized = fluid.layers.cast(
            (targets + 1.0) / 2.0 * num_channels, dtype="int64")

        # per_sample_loss shape: [bs, len, 1]
        per_sample_loss = fluid.layers.softmax_with_cross_entropy(
            logits=mix_parameters, label=fluid.layers.unsqueeze(quantized, 2))
        loss = fluid.layers.reduce_mean(per_sample_loss)

        return loss

    def mixture_density_loss(self, targets, mix_parameters, log_scale_min):
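        """Negative log-likelihood of the targets under a mixture of
        Gaussians, -log(sum_k pi_k * N(x | mu_k, s_k)), averaged over the
        batch, with log scales clipped from below at log_scale_min.
        """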
        # targets: [bs, len]
        # mix_params: [bs, len, 3 * num_mixture]
        targets = targets[:, self.context_size:]
        mix_parameters = mix_parameters[:, self.context_size:, :]

        # log_s: [bs, len, num_mixture]
        logits_pi, mu, log_s = fluid.layers.split(
            mix_parameters, num_or_sections=3, dim=-1)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        log_s = fluid.layers.clip(log_s, min=log_scale_min, max=100.0)
        inv_s = fluid.layers.exp(0.0 - log_s)

        # Calculate gaussian loss.
        targets = fluid.layers.unsqueeze(targets, -1)
        targets = fluid.layers.expand(targets,
                                      [1, 1, self.config.num_mixtures])
        x_std = inv_s * (targets - mu)
        exponent = fluid.layers.exp(-0.5 * x_std * x_std)
        pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
        pdf_x = pi * pdf_x
        # pdf_x: [bs, len]
        pdf_x = fluid.layers.reduce_sum(pdf_x, dim=-1)
        per_sample_loss = 0.0 - fluid.layers.log(pdf_x + 1e-9)

        loss = fluid.layers.reduce_mean(per_sample_loss)

        return loss

    def forward(self, audios, mels, audio_starts, sample=False):
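        """Teacher-forced training step: the network sees the audio shifted by
        one sample and predicts distribution parameters for the next sample.
        Returns the training loss and, if sample=True, audio sampled from the
        predicted distributions.
        """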
        # Build conditioner based on mels.
        full_conditioner = self.conditioner(mels)

        # Slice conditioners.
        audio_length = audios.shape[1]
        conditioner = extract_slices(full_conditioner, audio_starts,
                                     audio_length, self.rank)

        # input_audio, target_audio: [bs, len]
        input_audios = audios[:, :-1]
        target_audios = audios[:, 1:]
        # conditioner: [bs, len, mel_bands]
        conditioner = conditioner[:, 1:, :]

        loss_type = self.config.loss_type

        if loss_type == "softmax":
            input_audios = fluid.layers.clip(
                input_audios, min=-1.0, max=0.99999)
            # quantized have values in [0, num_channels)
            quantized = fluid.layers.cast(
                (input_audios + 1.0) / 2.0 * self.config.num_channels,
                dtype="int64")
            layer_input = self.embedding_fc(
                fluid.layers.unsqueeze(quantized, 2))
        elif loss_type == "mix-gaussian-pdf":
            layer_input = self.embedding_fc(
                fluid.layers.unsqueeze(input_audios, 2))
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        # layer_input: [bs, res_channel, 1, len]
        layer_input = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                layer_input, perm=[0, 2, 1]), 2)
        # conditioner: [bs, mel_bands, 1, len]
        conditioner = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                conditioner, perm=[0, 2, 1]), 2)

        skip = None
        for i, layer in enumerate(self.dilated_causal_convs):
            # layer_input: [bs, res_channel, 1, len]
            # skip: [bs, res_channel, 1, len]
            layer_input, skip = layer(layer_input, skip, conditioner)

        # Reshape skip to [bs, len, res_channel]
        skip = fluid.layers.transpose(
            fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
        mix_parameters = self.fc3(self.fc2(self.fc1(skip)))

        # Sample teacher-forced audio.
        sample_audios = None
        if sample:
            if loss_type == "softmax":
                sample_audios = self.sample_softmax(mix_parameters)
            elif loss_type == "mix-gaussian-pdf":
                sample_audios = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

        if loss_type == "softmax":
            loss = self.softmax_loss(target_audios, mix_parameters)
        elif loss_type == "mix-gaussian-pdf":
            loss = self.mixture_density_loss(target_audios, mix_parameters,
                                             self.log_scale_min)
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        return loss, sample_audios

    def synthesize(self, mels):
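        """Autoregressive synthesis: generate one sample per conditioner time
        step, feeding each sample back as the next input through the
        incremental (add_input) path of the causal convolutions.
        """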
        self.start_new_sequence()
        bs, n_frames, mel_bands = mels.shape
        conditioner = self.conditioner(mels)
        time_steps = conditioner.shape[1]

        print("input mels shape", mels.shape)
        print("Total synthesis steps", time_steps)

        loss_type = self.config.loss_type
        audio_samples = []
        current_sample = fluid.layers.zeros(shape=[bs, 1, 1], dtype="float32")
        for i in range(time_steps):
            if i % 100 == 0:
                print("Step", i)

            # Convert from real value sample to audio embedding.
            # audio_input: [bs, 1, channel]
            if loss_type == "softmax":
                current_sample = fluid.layers.clip(
                    current_sample, min=-1.0, max=0.99999)
                # quantized have values in [0, num_channels)
                quantized = fluid.layers.cast(
                    (current_sample + 1.0) / 2.0 * self.config.num_channels,
                    dtype="int64")
                audio_input = self.embedding_fc(quantized)
            elif loss_type == "mix-gaussian-pdf":
                audio_input = self.embedding_fc(current_sample)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

            # [bs, channel, 1, 1]
            audio_input = fluid.layers.unsqueeze(
                fluid.layers.transpose(
                    audio_input, perm=[0, 2, 1]), 2)
            # [bs, mel_bands]
            cond_input = conditioner[:, i, :]
            # [bs, mel_bands, 1, 1]
            cond_input = fluid.layers.reshape(cond_input,
                                              cond_input.shape + [1, 1])

            skip = None
            for layer in self.dilated_causal_convs:
                audio_input, skip = layer.add_input(audio_input, skip,
                                                    cond_input)

            # [bs, 1, channel]
            skip = fluid.layers.transpose(
                fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
            mix_parameters = self.fc3(self.fc2(self.fc1(skip)))
            if loss_type == "softmax":
                sample = self.sample_softmax(mix_parameters)
            elif loss_type == "mix-gaussian-pdf":
                sample = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))
            audio_samples.append(sample)
            # [bs]
            current_sample = audio_samples[-1]
            # [bs, 1, 1]
            current_sample = fluid.layers.reshape(
                current_sample, current_sample.shape + [1, 1])

        # syn_audio: [num_samples]
        syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()

        return syn_audio

    def start_new_sequence(self):
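        """Reset the incremental-inference state of every causal Conv1D
        sublayer before starting autoregressive synthesis.
        """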
        for layer in self.sublayers():
            if isinstance(layer, conv.Conv1D):
                layer.start_new_sequence()