crnn_ctc_model.py 7.3 KB
Newer Older
W
wanghaoshuang 已提交
1
import paddle.fluid as fluid
2 3 4
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import math
W
wanghaoshuang 已提交
5 6 7 8 9 10 11 12


def conv_bn_pool(input,
                 group,
                 out_ch,
                 act="relu",
                 param=None,
                 bias=None,
                 param_0=None,
                 is_test=False,
                 pooling=True,
                 use_cudnn=False):
    """Stack `group` conv+batch-norm layers, optionally followed by a max-pool.

    Args:
        input: input variable (NCHW feature map).
        group (int): number of conv + batch_norm repetitions.
        out_ch (list[int]): output channel count for each repetition.
        act (str): activation applied by batch_norm (the conv itself is linear).
        param: ParamAttr for the conv weights (when ``param_0`` is None) and
            for the batch_norm scale.
        bias: ParamAttr for the batch_norm bias.
        param_0: optional ParamAttr that overrides the conv weight attr.
        is_test (bool): batch_norm inference-mode flag.
        pooling (bool): append a 2x2 stride-2 max-pool when True.
        use_cudnn (bool): forwarded to conv2d/pool2d.

    Returns:
        The output variable of the last layer in the stack.
    """
    tmp = input
    # BUGFIX: use py3-compatible `range` instead of py2-only `xrange`.
    for i in range(group):
        tmp = fluid.layers.conv2d(
            input=tmp,
            num_filters=out_ch[i],
            filter_size=3,
            padding=1,
            param_attr=param if param_0 is None else param_0,
            act=None,  # LinearActivation; nonlinearity comes from batch_norm
            use_cudnn=use_cudnn)
        tmp = fluid.layers.batch_norm(
            input=tmp,
            act=act,
            param_attr=param,
            bias_attr=bias,
            is_test=is_test)
    if pooling:
        tmp = fluid.layers.pool2d(
            input=tmp,
            pool_size=2,
            pool_type='max',
            pool_stride=2,
            use_cudnn=use_cudnn,
            ceil_mode=True)

    return tmp


45 46 47 48 49
def ocr_convs(input,
              regularizer=None,
              gradient_clip=None,
              is_test=False,
              use_cudnn=False):
    """Four conv groups of the CRNN backbone; the last group skips pooling."""
    zero_init = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.0))
    narrow_init = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.0005))
    wide_init = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.01))

    # First group: conv weights use the tighter init via param_0.
    out = conv_bn_pool(
        input,
        2, [16, 16],
        param=wide_init,
        bias=zero_init,
        param_0=narrow_init,
        is_test=is_test,
        use_cudnn=use_cudnn)

    # Middle groups share identical configuration apart from channel width.
    for channels in ([32, 32], [64, 64]):
        out = conv_bn_pool(
            out,
            2,
            channels,
            param=wide_init,
            bias=zero_init,
            is_test=is_test,
            use_cudnn=use_cudnn)

    # Final group keeps the full spatial width for the sequence stage.
    out = conv_bn_pool(
        out,
        2, [128, 128],
        param=wide_init,
        bias=zero_init,
        is_test=is_test,
        pooling=False,
        use_cudnn=use_cudnn)
    return out


def encoder_net(images,
                num_classes,
                rnn_hidden_size=200,
                regularizer=None,
                gradient_clip=None,
                is_test=False,
                use_cudnn=False):
    """CNN backbone plus bidirectional GRU encoder.

    Produces a fully-connected output with ``num_classes + 1`` columns per
    timestep (the extra column is the CTC blank label).
    """

    def _normal_attr(std, lr=None):
        # Shared ParamAttr factory: Normal(0, std) init with the common
        # regularizer/clip; `lr` optionally scales the learning rate.
        kwargs = dict(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=fluid.initializer.Normal(0.0, std))
        if lr is not None:
            kwargs['learning_rate'] = lr
        return fluid.ParamAttr(**kwargs)

    conv_features = ocr_convs(
        images,
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        is_test=is_test,
        use_cudnn=use_cudnn)

    # Slice the feature map column by column into a sequence (width = time).
    sliced_feature = fluid.layers.im2sequence(
        input=conv_features,
        stride=[1, 1],
        filter_size=[conv_features.shape[2], 1])

    para_attr = _normal_attr(0.02)
    bias_attr = _normal_attr(0.02, lr=2.0)
    bias_attr_nobias = _normal_attr(0.02)

    # Two independent projections feed the forward and backward GRUs.
    fc_1 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)
    fc_2 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)

    gru_forward = fluid.layers.dynamic_gru(
        input=fc_1,
        size=rnn_hidden_size,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')
    gru_backward = fluid.layers.dynamic_gru(
        input=fc_2,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')

    # Final classifier over the concatenated bidirectional states.
    w_attr = _normal_attr(0.02)
    b_attr = _normal_attr(0.0)
    fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
                             size=num_classes + 1,
                             param_attr=w_attr,
                             bias_attr=b_attr)

    return fc_out


169
def ctc_train_net(args, data_shape, num_classes):
    """Build the CRNN-CTC training program.

    Args:
        args: parsed CLI args; reads ``use_gpu``, ``total_step``,
            ``average_window``, ``min_average_window``, ``max_average_window``.
        data_shape: CHW shape of the input image tensor.
        num_classes (int): number of real character classes; the CTC blank is
            appended as label index ``num_classes``.

    Returns:
        Tuple of (sum_cost, error_evaluator, inference_program, model_average).
    """
    L2_RATE = 0.0004
    LR = 1.0e-3
    MOMENTUM = 0.9
    # Set to "piecewise_decay" to enable the step schedule below.
    learning_rate_decay = None
    regularizer = fluid.regularizer.L2Decay(L2_RATE)

    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)
    fc_out = encoder_net(
        images,
        num_classes,
        regularizer=regularizer,
        use_cudnn=bool(args.use_gpu))
    cost = fluid.layers.warpctc(
        input=fc_out, label=label, blank=num_classes, norm_by_times=True)
    sum_cost = fluid.layers.reduce_sum(cost)
    decoded_out = fluid.layers.ctc_greedy_decoder(
        input=fc_out, blank=num_classes)
    casted_label = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded_out, label=casted_label)
    # Clone for inference BEFORE optimizer ops are appended to the program.
    inference_program = fluid.default_main_program().clone(for_test=True)
    if learning_rate_decay == "piecewise_decay":
        # BUGFIX: floor division keeps the decay boundaries integral under
        # Python 3 (plain `/` would hand piecewise_decay float step counts).
        learning_rate = fluid.layers.piecewise_decay([
            args.total_step // 4, args.total_step // 2,
            args.total_step * 3 // 4
        ], [LR, LR * 0.1, LR * 0.01, LR * 0.001])
    else:
        learning_rate = LR

    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate, momentum=MOMENTUM)
    _, params_grads = optimizer.minimize(sum_cost)
    model_average = None
    if args.average_window > 0:
        model_average = fluid.optimizer.ModelAverage(
            args.average_window,
            min_average_window=args.min_average_window,
            max_average_window=args.max_average_window)
    return sum_cost, error_evaluator, inference_program, model_average
W
wanghaoshuang 已提交
210 211


212 213
def ctc_infer(images, num_classes, use_cudnn):
    """Run the encoder in inference mode and greedy-decode its output."""
    logits = encoder_net(
        images, num_classes, is_test=True, use_cudnn=use_cudnn)
    decoded = fluid.layers.ctc_greedy_decoder(input=logits, blank=num_classes)
    return decoded


217 218 219 220
def ctc_eval(data_shape, num_classes, use_cudnn):
    """Build the evaluation program.

    Returns the edit-distance evaluator over greedy-decoded predictions and
    the per-sample CTC cost.
    """
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)
    logits = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn)

    # CTC cost against the raw int32 labels.
    cost = fluid.layers.warpctc(
        input=logits, label=label, blank=num_classes, norm_by_times=True)

    # Edit distance needs int64 labels and the greedy-decoded sequence.
    predictions = fluid.layers.ctc_greedy_decoder(
        input=logits, blank=num_classes)
    label_int64 = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=predictions, label=label_int64)

    return error_evaluator, cost