# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D

class PostConvNet(dg.Layer):
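    """Decoder post conv net of TransformerTTS: a stack of Conv1D layers with
    batch norm, tanh, and dropout after every conv but the last."""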
    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
                 padding=0,
                 num_conv=5,
                 outputs_per_step=1,
                 use_cudnn=True,
                 dropout=0.1,
                 batchnorm_last=False):
        """Decocder post conv net of TransformerTTS.

        Args:
            n_mels (int, optional): the number of mel bands when calculating mel spectrograms. Defaults to 80.
            num_hidden (int, optional): the size of hidden layer in network. Defaults to 512.
            filter_size (int, optional): the filter size of Conv.  Defaults to 5.
            padding (int, optional): the padding size of Conv. Defaults to 0.
            num_conv (int, optional): the num of Conv layers in network. Defaults to 5.
            outputs_per_step (int, optional): the num of output frames per step . Defaults to 1.
            use_cudnn (bool, optional): use cudnn in Conv or not. Defaults to True.
            dropout (float, optional): dropout probability. Defaults to 0.1.
            batchnorm_last (bool, optional): if batchnorm at last layer or not. Defaults to False.
        """
        super(PostConvNet, self).__init__()
        self.dropout = dropout
        self.num_conv = num_conv
        self.batchnorm_last = batchnorm_last
        self.conv_list = []
        # Uniform bias init bound, based on the first conv's input channel count.
        k = math.sqrt(1.0 / (n_mels * outputs_per_step))
        self.conv_list.append(
            Conv1D(
                num_channels=n_mels * outputs_per_step,
                num_filters=num_hidden,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))
        # Bias bound for the remaining convs, whose input channel count is num_hidden.
        k = math.sqrt(1.0 / num_hidden)
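        # range(1, num_conv - 1) yields num_conv - 2 middle convs; with the
        # first and last conv this gives num_conv layers in total.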
        for _ in range(1, num_conv - 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=filter_size,
                    padding=padding,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))
        self.conv_list.append(
            Conv1D(
                num_channels=num_hidden,
                num_filters=n_mels * outputs_per_step,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))

        # Register the convs as sublayers so the dygraph Layer tracks their parameters.
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
        ]
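        # num_conv - 1 batch norms by default; the final conv only gets one
        # when batchnorm_last is set (see below).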
        if self.batchnorm_last:
            self.batch_norm_list.append(
                dg.BatchNorm(
                    n_mels * outputs_per_step, data_layout='NCHW'))
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

    def forward(self, input):
        """
109
        Compute the mel spectrum.
L
lifuchen 已提交
110 111
        
        Args:
112 113
            input (Variable): shape(B, T, C), dtype float32, the result of mel linear projection. 
               
L
lifuchen 已提交
114
        Returns:
115
           output (Variable): shape(B, T, C), the result after postconvnet.
L
lifuchen 已提交
116
        """

        # (B, T, C) -> (B, C, T) for the 1-d convolutions.
        input = layers.transpose(input, [0, 2, 1])
        # Avoid shadowing the builtin len; conv outputs are trimmed back to this length.
        seq_len = input.shape[-1]
        for i in range(self.num_conv - 1):
            batch_norm = self.batch_norm_list[i]
            conv = self.conv_list[i]

            # Conv -> batch norm -> tanh -> dropout, trimming padding back to seq_len.
            input = layers.dropout(
                layers.tanh(batch_norm(conv(input)[:, :, :seq_len])),
                self.dropout,
                dropout_implementation='upscale_in_train')
        # The last conv has no tanh; batch norm is applied only if batchnorm_last.
        conv = self.conv_list[self.num_conv - 1]
        input = conv(input)[:, :, :seq_len]
        if self.batchnorm_last:
            batch_norm = self.batch_norm_list[self.num_conv - 1]
            input = layers.dropout(
                batch_norm(input),
                self.dropout,
                dropout_implementation='upscale_in_train')
        # (B, C, T) -> back to (B, T, C).
        output = layers.transpose(input, [0, 2, 1])
        return output
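

# A minimal usage sketch (an assumption, not part of the original module):
# run the net on a dummy batch in dygraph mode. padding=4 keeps the conv
# output at least as long as the input before it is trimmed to seq_len.
if __name__ == "__main__":
    import numpy as np

    with dg.guard():
        net = PostConvNet(
            n_mels=80,
            num_hidden=512,
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=1,
            use_cudnn=False)
        mel = dg.to_variable(np.zeros((2, 100, 80), dtype="float32"))
        out = net(mel)
        print(out.shape)  # expect [2, 100, 80]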