"""Contains DeepSpeech2 layers and networks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out,
                  stride, padding, act):
    """Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel, or a tuple for two
                        image dimensions.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: The x dimension of the stride, or a tuple for two image
                   dimensions.
    :type stride: int|tuple|list
    :param padding: The x dimension of the padding, or a tuple for two image
                    dimensions.
    :type padding: int|tuple|list
    :param act: Activation type.
    :type act: BaseActivation
    :return: Batch norm layer after convolution layer.
    :rtype: LayerOutput
    """
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
    """Bidirectonal simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Bidirectional simple rnn layer.
    :rtype: LayerOutput
    """
    if share_weights:
        # input-hidden weights shared between bi-directional rnn
        input_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn = paddle.layer.batch_norm(
            input=input_proj, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=True)

    else:
        input_proj_forward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        input_proj_backward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn_forward = paddle.layer.batch_norm(
            input=input_proj_forward, act=paddle.activation.Linear())
        input_proj_bn_backward = paddle.layer.batch_norm(
            input=input_proj_backward, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_forward, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_backward, act=act, reverse=True)

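    # concatenate the forward and backward outputs along the feature
    # dimension, so the layer output size is 2 * size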
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectonal gru layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name of the layer.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type.
    :type act: BaseActivation
    :return: Bidirectional GRU layer.
    :rtype: LayerOutput
    """
    input_proj_forward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    input_proj_backward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # batch norm is only performed on input-related projections
    input_proj_bn_forward = paddle.layer.batch_norm(
        input=input_proj_forward, act=paddle.activation.Linear())
    input_proj_bn_backward = paddle.layer.batch_norm(
        input=input_proj_backward, act=paddle.activation.Linear())
    # forward and backward in time
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn_forward, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn_backward, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_gru, backward_gru])


def conv_group(input, num_stacks):
    """Convolution group with stacked convolution layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :return: Output layer of the convolution group.
    :rtype: LayerOutput
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
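    # the output height below assumes an input spectrogram height of 161
    # frequency bins (the DeepSpeech2 feature setup): each stacked layer
    # halves the height with its stride of 2, giving 160 // 2^num_stacks + 1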
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
    """RNN group with stacked bidirectional simple RNN layers.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use GRU if set True; use a simple RNN if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    for i in xrange(num_stacks):
        if use_gru:
            output = bidirectional_gru_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.Relu())
            # BRelu does not support hppl yet (to be added later); use Relu instead.
        else:
            output = bidirectional_simple_rnn_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu(),
                share_weights=share_rnn_weights)
    return output


def deep_speech_v2_network(audio_data,
                           text_data,
                           dict_size,
                           num_conv_layers=2,
                           num_rnn_layers=3,
                           rnn_size=256,
                           use_gru=False,
                           share_rnn_weights=True):
    """The DeepSpeech2 network structure.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param use_gru: Use GRU if set True; use a simple RNN if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of an output unnormalized log probability layer
             (before softmax) and a CTC cost layer.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
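    # with block_y equal to the full feature-map height and block_x = 1, each
    # column of the feature map becomes one time step with dimension
    # num_channels * height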
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq,
        size=rnn_size,
        num_stacks=num_rnn_layers,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
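    # project to dict_size + 1 classes: the vocabulary plus one extra class
    # reserved for the CTC blank label (index dict_size)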
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss
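

# Example usage (a minimal sketch, not part of the original module; the data
# layer names, the 161 * 161 spectrogram size, and the vocabulary size are
# illustrative assumptions):
#
#     vocab_size = 29
#     audio_data = paddle.layer.data(
#         name="audio_spectrogram",
#         type=paddle.data_type.dense_array(161 * 161))
#     text_data = paddle.layer.data(
#         name="transcript_text",
#         type=paddle.data_type.integer_value_sequence(vocab_size))
#     log_probs, ctc_loss = deep_speech_v2_network(
#         audio_data=audio_data,
#         text_data=text_data,
#         dict_size=vocab_size,
#         num_conv_layers=2,
#         num_rnn_layers=3,
#         rnn_size=256,
#         use_gru=False,
#         share_rnn_weights=True)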