"""Contains DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """
    Convolution layer with batch normalization.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: Filter size of the convolution, as (height, width).
    :type filter_size: int|tuple
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels (convolution filters).
    :type num_channels_out: int
    :param stride: Convolution stride, as (height, width).
    :type stride: int|tuple
    :param padding: Convolution padding, as (height, width).
    :type padding: int|tuple
    :param act: Activation applied after batch normalization.
    :type act: BaseActivation
    :return: Batch-normalized convolution output layer.
    :rtype: LayerOutput
    """
    # The convolution itself is linear and bias-free: the non-linearity and
    # the learned shift/scale both come from the batch-norm layer that follows.
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    return paddle.layer.batch_norm(input=conv_layer, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """
    Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Name prefix for the layer. NOTE(review): currently unused in
                 the body; kept for interface compatibility with callers.
    :type name: basestring
    :param input: Input layer (sequence of feature vectors).
    :type input: LayerOutput
    :param size: Hidden size of each direction's RNN.
    :type size: int
    :param act: Activation of the recurrent layers.
    :type act: BaseActivation
    :return: Concatenation of the forward and backward RNN outputs.
    :rtype: LayerOutput
    """
    # Input-hidden projection weights are shared across the bi-directional rnn.
    input_proj = paddle.layer.fc(
        input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
    # Batch norm is applied only to the input-state projection, not to the
    # recurrent (hidden-hidden) connections.
    input_proj_bn = paddle.layer.batch_norm(
        input=input_proj, act=paddle.activation.Linear())
    # Run the same normalized projection through time forwards and backwards.
    forward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=False)
    backward_simple_rnn = paddle.layer.recurrent(
        input=input_proj_bn, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def conv_group(input, num_stacks):
    """
    Convolution group with several stacking convolution layers.

    :param input: Input layer (audio spectrogram, single channel).
    :type input: LayerOutput
    :param num_stacks: Total number of stacked convolution layers.
    :type num_stacks: int
    :return: Tuple of (output layer, number of output channels,
             output feature-map height).
    :rtype: tuple
    """
    # First layer downsamples time (stride 3) and frequency (stride 2).
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu())
    # Remaining layers keep time resolution but halve frequency each time.
    # range() instead of xrange() for Python 3 compatibility (the file already
    # opts into py3 semantics via __future__ imports); behavior is identical.
    for i in range(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu())
    output_num_channels = 32
    # Height shrinks by 2x per layer from an assumed 160-bin input spectrogram
    # (plus one from padding). NOTE(review): 160 is hard-coded — confirm it
    # matches the feature extractor's frequency-bin count.
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
    """
    RNN group with several stacking RNN layers.

    :param input: Input layer (sequence of feature vectors).
    :type input: LayerOutput
    :param size: Hidden size of each direction in every RNN layer.
    :type size: int
    :param num_stacks: Number of stacked bidirectional RNN layers.
    :type num_stacks: int
    :return: Output of the last stacked RNN layer.
    :rtype: LayerOutput
    """
    output = input
    # range() instead of xrange() for Python 3 compatibility; identical here.
    for i in range(num_stacks):
        output = bidirectional_simple_rnn_bn_layer(
            name=str(i), input=output, size=size, act=paddle.activation.BRelu())
    return output


def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256,
                 is_inference=False):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param is_inference: False in the training mode, and True in the
                         inference mode.
    :type is_inference: bool
    :return: If is_inference set False, return a ctc cost layer;
             if is_inference set True, return a sequence layer of output
             probability distribution.
    :rtype: LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # Convert data from convolution feature map to sequence of vectors:
    # each time step becomes one (channels * height)-dimensional vector.
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # rnn group
    rnn_group_output = rnn_group(
        input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
    # Per-timestep scores over the vocabulary plus one extra CTC blank label.
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    if is_inference:
        # probability distribution with softmax
        return paddle.layer.mixed(
            input=paddle.layer.identity_projection(input=fc),
            act=paddle.activation.Softmax())
    else:
        # CTC cost; the blank label is the last index (dict_size).
        return paddle.layer.warp_ctc(
            input=fc,
            label=text_data,
            size=dict_size + 1,
            blank=dict_size,
            norm_by_times=True)