length_regulator.py 6.4 KB
Newer Older
L
lifuchen 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
L
lifuchen 已提交
14 15
import numpy as np
import math
L
lifuchen 已提交
16
import parakeet.models.fastspeech.utils
L
lifuchen 已提交
17 18 19
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
L
lifuchen 已提交
20
from parakeet.modules.customized import Conv1D
L
lifuchen 已提交
21

L
lifuchen 已提交
22

L
lifuchen 已提交
23 24 25
class LengthRegulator(dg.Layer):
    """Length Regulator block in FastSpeech.

    Repeats every encoder frame along the time axis according to a per-token
    duration, then zero-pads the expanded sequences of a batch to a common
    length. It also owns the duration predictor that supplies the durations
    at inference time.
    """

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """
        Args:
            input_size (int): channel size of the input fed to the duration
                predictor.
            out_channels (int): hidden channel size of the duration predictor.
            filter_size (int): convolution filter size of the duration
                predictor.
            dropout (float, optional): dropout probability of the duration
                predictor. Defaults to 0.1.
        """
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(
            input_size=input_size,
            out_channels=out_channels,
            filter_size=filter_size,
            dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        """Expand every sequence of the batch and pad them to equal length.

        Args:
            x (Variable): Shape(B, T, C). The encoder output.
            duration_predictor_output (Variable): Shape(B, T). The duration
                (number of repetitions) of every frame.
            alpha (float, optional): factor applied to every duration.
                Defaults to 1.0.

        Returns:
            Variable: Shape(B, T_mel, C), where T_mel is the longest expanded
                length in the batch; shorter sequences are zero-padded.
        """
        # Sequences are expanded one by one because each of them has its own
        # durations and therefore its own expanded length.
        expanded = [
            self.expand(x[i:i + 1], duration_predictor_output[i:i + 1], alpha)
            for i in range(x.shape[0])
        ]
        return self.pad(expanded)

    def pad(self, input_ele):
        """Zero-pad a list of 2-D tensors along axis 0 and stack them.

        Args:
            input_ele (list[Variable]): tensors of Shape(T_i, C).

        Returns:
            Variable: Shape(len(input_ele), max(T_i), C).
        """
        max_len = max(ele.shape[0] for ele in input_ele)
        out_list = [
            # Paddings are [before_0, after_0, before_1, after_1]: only the
            # end of the time axis is padded.
            layers.pad(
                ele, [0, max_len - ele.shape[0], 0, 0], pad_value=0.0)
            for ele in input_ele
        ]
        return layers.stack(out_list)

    def expand(self, batch, predicted, alpha):
        """Repeat each frame of a single sequence by its scaled duration.

        Args:
            batch (Variable): Shape(1, T, C). One sequence of encoder output.
            predicted (Variable): Shape(1, T). The duration of every frame.
            alpha (float): factor applied to every duration before it is
                rounded to a repetition count.

        Returns:
            Variable: Shape(T_mel, C), the frames repeated along axis 0.

        Raises:
            ValueError: if every scaled duration is zero, so that there is
                nothing to concatenate.
        """
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        out = []
        for i in range(time_steps):
            # Scale by alpha here so the speed control actually takes effect;
            # with alpha == 1.0 integral durations are left unchanged.
            repeats = int(round(float(fertilities[0, i]) * alpha))
            if repeats <= 0:
                # Frames with no duration contribute nothing to the output.
                continue
            out.append(layers.expand(batch[i:i + 1, :], [repeats, 1]))

        if not out:
            raise ValueError(
                "all durations are zero after scaling by alpha; "
                "the expanded sequence would be empty")
        return layers.concat(out, axis=0)

    def forward(self, x, alpha=1.0, target=None):
        """
        Length Regulator block in FastSpeech.

        Args:
            x (Variable): Shape(B, T, C), dtype: float32. The encoder output.
            alpha (Constant): dtype: float32. The hyperparameter to determine
                the length of the expanded sequence mel, thereby controlling
                the voice speed. Only used in eval mode.
            target (Variable, optional): Shape(B, T_text), dtype: int64. The
                duration of phoneme computed from pretrained transformerTTS.
                Required in train mode.

        Returns:
            In train mode:
                output (Variable), Shape(B, T_mel, C), x expanded with the
                    target durations.
                duration_predictor_output (Variable), Shape(B, T), the output
                    of the duration predictor.
            In eval mode:
                output (Variable), Shape(B, T_mel, C), x expanded with the
                    rounded predicted durations scaled by alpha.
                mel_pos (Variable), Shape(1, T_mel), dtype: int64. The
                    positions 1..T_mel of the expanded sequence.

        Raises:
            ValueError: if called in train mode without target.
        """
        duration_predictor_output = self.duration_predictor(x)
        if fluid.framework._dygraph_tracer()._train_mode:
            if target is None:
                raise ValueError(
                    "target durations are required in train mode")
            # Teacher forcing: expand with the ground-truth durations and
            # return the raw prediction so that the caller can compute a loss.
            output = self.LR(x, target)
            return output, duration_predictor_output

        duration_predictor_output = layers.round(duration_predictor_output)
        output = self.LR(x, duration_predictor_output, alpha)
        # Explicit int64: the default integer type of np.arange is
        # platform dependent.
        mel_pos = dg.to_variable(
            np.arange(
                1, output.shape[1] + 1, dtype=np.int64))
        mel_pos = layers.unsqueeze(mel_pos, [0])
        return output, mel_pos

L
lifuchen 已提交
93

L
lifuchen 已提交
94 95 96 97 98 99 100 101
class DurationPredictor(dg.Layer):
    """Duration Predictor block in FastSpeech.

    Two (Conv1D -> LayerNorm -> ReLU -> dropout) stages followed by a linear
    projection to one value per time step.
    """

    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        """
        Args:
            input_size (int): channel size of the input.
            out_channels (int): channel size of both convolutions.
            filter_size (int): filter size of both convolutions.
            dropout (float, optional): dropout probability. Defaults to 0.1.
        """
        super(DurationPredictor, self).__init__()
        self.input_size = input_size
        self.out_channels = out_channels
        self.filter_size = filter_size
        self.dropout = dropout

        # Float division on purpose: `1 / n` would be 0 under Python 2
        # integer division and collapse the bias range to [0, 0].
        k = math.sqrt(1.0 / self.input_size)
        # NOTE(review): padding is fixed to 1, which presumably keeps the
        # time length only for filter_size == 3 -- confirm against Conv1D.
        self.conv1 = Conv1D(
            num_channels=self.input_size,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        # conv2 and the final linear layer both take out_channels inputs, so
        # they share this bias range.
        k = math.sqrt(1.0 / self.out_channels)
        self.conv2 = Conv1D(
            num_channels=self.out_channels,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)

        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-k, high=k))

        self.linear = dg.Linear(
            self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)

    def forward(self, encoder_output):
        """
        Duration Predictor block in FastSpeech.

        Args:
            encoder_output (Variable): Shape(B, T, C), dtype: float32. The
                encoder output.

        Returns:
            out (Variable), Shape(B, T), the non-negative output of the
                duration predictor (one value per time step).
        """
        # encoder_output.shape(N, T, C); the convolutions are applied on the
        # channel-first layout, the layer norms on the channel-last one.
        out = layers.transpose(encoder_output, [0, 2, 1])
        out = self.conv1(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(
            layers.relu(self.layer_norm1(out)),
            self.dropout,
            dropout_implementation='upscale_in_train')

        out = layers.transpose(out, [0, 2, 1])
        out = self.conv2(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(
            layers.relu(self.layer_norm2(out)),
            self.dropout,
            dropout_implementation='upscale_in_train')

        # Project to one value per step; relu keeps it non-negative and the
        # squeeze drops the trailing singleton axis.
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])

        return out