"""
   A simplifed version of Baidu DeepSpeech2 model.
"""

import paddle.v2 as paddle

#TODO: add bidirectional rnn.


def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act):
    """
    Convolution layer followed by batch normalization.

    The convolution itself is linear and bias-free; the activation is
    applied after the batch-norm step, so normalization always sees the
    raw convolution output.

    :param input: Input layer.
    :param filter_size: Convolution kernel size.
    :param num_channels_in: Number of input channels.
    :param num_channels_out: Number of output filters/channels.
    :param stride: Convolution stride.
    :param padding: Convolution padding.
    :param act: Activation applied after batch normalization.
    :return: Batch-normalized convolution layer.
    """
    linear_conv = paddle.layer.img_conv(
        input=input,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        filter_size=filter_size,
        stride=stride,
        padding=padding,
        bias_attr=False,
        act=paddle.activation.Linear())
    return paddle.layer.batch_norm(input=linear_conv, act=act)


def bidirectional_simple_rnn_bn_layer(name, input, size, act):
    """
    Bidirectional simple rnn layer with sequence-wise batch normalization.
    The batch normalization is only performed on input-state weights.

    :param name: Layer name prefix.
    :param input: Input layer.
    :param size: Hidden size of each direction's recurrence.
    :param act: Activation of the recurrent units.
    :return: Concatenation of the forward and backward RNN outputs.
    """
    # NOTE(review): `name` is accepted but not applied to any sub-layer
    # here -- confirm whether named parameter sharing was intended.
    # A single input-to-hidden projection is shared by both directions;
    # batch norm is applied to this projection only, never to the
    # hidden-to-hidden (recurrent) weights.
    shared_proj = paddle.layer.fc(
        input=input,
        size=size,
        bias_attr=False,
        act=paddle.activation.Linear())
    normed_proj = paddle.layer.batch_norm(
        input=shared_proj, act=paddle.activation.Linear())
    # Run the normalized projection through a forward-time and a
    # reverse-time simple recurrence, then join their outputs.
    rnn_fwd = paddle.layer.recurrent(input=normed_proj, act=act, reverse=False)
    rnn_bwd = paddle.layer.recurrent(input=normed_proj, act=act, reverse=True)
    return paddle.layer.concat(input=[rnn_fwd, rnn_bwd])


def conv_group(input, num_stacks):
    """
    Convolution group with several stacking convolution layers.

    :param input: Input layer (spectrogram image).
    :param num_stacks: Total number of stacked convolution layers.
    :return: Tuple of (output layer, output channel count, output height).
    """
    # First stack: single-channel spectrogram in, 32 feature maps out,
    # striding (3, 2); every later stack keeps 32 channels and uses a
    # smaller kernel with stride (1, 2).
    specs = [((11, 41), 1, (3, 2), (5, 20))]
    specs += [((11, 21), 32, (1, 2), (5, 10))] * (num_stacks - 1)
    conv = input
    for kernel, channels_in, stride, pad in specs:
        conv = conv_bn_layer(
            input=conv,
            filter_size=kernel,
            num_channels_in=channels_in,
            num_channels_out=32,
            stride=stride,
            padding=pad,
            act=paddle.activation.BRelu())
    # NOTE(review): the output height formula assumes a fixed 160-bin
    # input spectrogram -- confirm against the data feeder.
    output_num_channels = 32
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks):
    """
    RNN group with several stacking RNN layers.

    :param input: Input layer (sequence of feature vectors).
    :param size: Hidden size of each RNN direction.
    :param num_stacks: Number of stacked bidirectional RNN layers.
    :return: Output of the last stacked RNN layer.
    """
    layer_out = input
    for stack_idx in xrange(num_stacks):
        layer_out = bidirectional_simple_rnn_bn_layer(
            name=str(stack_idx),
            input=layer_out,
            size=size,
            act=paddle.activation.BRelu())
    return layer_out


def deep_speech2(audio_data,
                 text_data,
                 dict_size,
                 num_conv_layers=2,
                 num_rnn_layers=3,
                 rnn_size=256,
                 is_inference=False):
    """
    The whole DeepSpeech2 model structure (a simplified version).

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param is_inference: False in the training mode, and True in the
                         inference mode.
    :type is_inference: bool
    :return: If is_inference is False, return a ctc cost layer; if
             is_inference is True, return a sequence layer of output
             probability distribution.
    :rtype: tuple of LayerOutput
    """
    # Stage 1: stacked 2-D convolutions over the spectrogram.
    conv_out, conv_channels, conv_height = conv_group(
        input=audio_data, num_stacks=num_conv_layers)
    # Stage 2: flatten each time step of the convolution feature maps into
    # a single vector so the recurrent layers see a plain sequence.
    seq_input = paddle.layer.block_expand(
        input=conv_out,
        num_channels=conv_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_height)
    # Stage 3: stacked bidirectional RNNs.
    rnn_out = rnn_group(
        input=seq_input, size=rnn_size, num_stacks=num_rnn_layers)
    # Stage 4: per-frame linear scores over the vocabulary plus one extra
    # slot reserved for the CTC blank label.
    scores = paddle.layer.fc(
        input=rnn_out,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    if not is_inference:
        # Training mode: CTC cost against the transcription, with the
        # blank label placed at index `dict_size`.
        return paddle.layer.warp_ctc(
            input=scores,
            label=text_data,
            size=dict_size + 1,
            blank=dict_size,
            norm_by_times=True)
    # Inference mode: softmax over the per-frame scores to get an output
    # probability distribution.
    return paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=scores),
        act=paddle.activation.Softmax())