from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.fluid as fluid
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import math

import six

def conv_bn_pool(input,
                 group,
                 out_ch,
                 act="relu",
                 param=None,
                 bias=None,
                 param_0=None,
                 is_test=False,
                 pooling=True,
                 use_cudnn=False):
    """Stack ``group`` 3x3 conv + batch-norm pairs, then optionally max-pool.

    Args:
        input: input feature-map variable.
        group: number of conv + batch_norm repetitions.
        out_ch: per-repetition output channel counts (indexed 0..group-1).
        act: activation applied by each batch_norm (convs stay linear).
        param: ParamAttr used for batch_norm scale and, when ``param_0`` is
            absent, for conv weights.
        bias: ParamAttr for the batch_norm bias.
        param_0: optional override ParamAttr for the conv weights.
        is_test: inference-mode flag forwarded to batch_norm.
        pooling: whether to append a 2x2 stride-2 max-pool.
        use_cudnn: forwarded to conv2d / pool2d.

    Returns:
        Output variable of the last layer in the stack.
    """
    # Conv weights prefer param_0 when it is supplied.
    conv_param = param if param_0 is None else param_0
    feat = input
    for idx in range(group):
        conv_out = fluid.layers.conv2d(
            input=feat,
            num_filters=out_ch[idx],
            filter_size=3,
            padding=1,
            param_attr=conv_param,
            act=None,  # linear conv; the non-linearity lives in batch_norm
            use_cudnn=use_cudnn)
        feat = fluid.layers.batch_norm(
            input=conv_out,
            act=act,
            param_attr=param,
            bias_attr=bias,
            is_test=is_test)
    if not pooling:
        return feat
    return fluid.layers.pool2d(
        input=feat,
        pool_size=2,
        pool_type='max',
        pool_stride=2,
        use_cudnn=use_cudnn,
        ceil_mode=True)


def ocr_convs(input,
              regularizer=None,
              gradient_clip=None,
              is_test=False,
              use_cudnn=False):
    """CNN backbone of the CRNN-CTC model.

    Four conv_bn_pool stages with channel widths 16, 32, 64 and 128; the
    last stage skips pooling so the feature map keeps its final resolution.

    Args:
        input: input image variable.
        regularizer: weight regularizer shared by every parameter here.
        gradient_clip: gradient-clip attribute shared by every parameter.
        is_test: inference-mode flag forwarded to batch_norm layers.
        use_cudnn: forwarded to conv/pool layers.

    Returns:
        The final convolutional feature map.
    """

    def _normal_attr(stddev):
        # All parameters share regularizer/clip; only the init stddev varies.
        return fluid.ParamAttr(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=fluid.initializer.Normal(0.0, stddev))

    b = _normal_attr(0.0)      # zero-initialized biases
    w0 = _normal_attr(0.0005)  # conv weights of the first stage only
    w1 = _normal_attr(0.01)    # every other weight

    out = conv_bn_pool(
        input,
        2, [16, 16],
        param=w1,
        bias=b,
        param_0=w0,
        is_test=is_test,
        use_cudnn=use_cudnn)
    for channels in ([32, 32], [64, 64]):
        out = conv_bn_pool(
            out,
            2,
            channels,
            param=w1,
            bias=b,
            is_test=is_test,
            use_cudnn=use_cudnn)
    return conv_bn_pool(
        out,
        2, [128, 128],
        param=w1,
        bias=b,
        is_test=is_test,
        pooling=False,
        use_cudnn=use_cudnn)


def encoder_net(images,
                num_classes,
                rnn_hidden_size=200,
                regularizer=None,
                gradient_clip=None,
                is_test=False,
                use_cudnn=False):
    """CRNN encoder: CNN backbone -> column-wise slicing -> bidirectional
    GRU -> per-timestep class scores.

    Args:
        images: input image variable (NCHW).
        num_classes: number of real character classes; the output layer has
            ``num_classes + 1`` units, the extra unit being the CTC blank.
        rnn_hidden_size: hidden size of each GRU direction.
        regularizer: weight regularizer shared by all parameters created here.
        gradient_clip: gradient-clip attribute shared by all parameters.
        is_test: inference-mode flag forwarded to batch_norm layers.
        use_cudnn: forwarded to conv/pool layers.

    Returns:
        fc_out: sequence of unnormalized class scores, one step per
        feature-map column.
    """
    conv_features = ocr_convs(
        images,
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        is_test=is_test,
        use_cudnn=use_cudnn)
    # Cut the feature map into width-1 columns (filter height = full
    # feature-map height) and flatten each column into one sequence step.
    sliced_feature = fluid.layers.im2sequence(
        input=conv_features,
        stride=[1, 1],
        filter_size=[conv_features.shape[2], 1])

    para_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))
    # GRU biases learn at twice the base learning rate.
    bias_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02),
        learning_rate=2.0)
    bias_attr_nobias = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))

    # dynamic_gru expects its input pre-projected to 3 * hidden_size
    # (update gate, reset gate, candidate), hence the fc size below.
    fc_1 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)
    fc_2 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)

    # Bidirectional GRU: separate projections feed the forward and
    # backward passes.
    gru_forward = fluid.layers.dynamic_gru(
        input=fc_1,
        size=rnn_hidden_size,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')
    gru_backward = fluid.layers.dynamic_gru(
        input=fc_2,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')

    w_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))
    b_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.0))

    # Concatenate both directions and project to num_classes + 1 (CTC blank).
    fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
                             size=num_classes + 1,
                             param_attr=w_attr,
                             bias_attr=b_attr)

    return fc_out


def ctc_train_net(args, data_shape, num_classes):
    """Build the CRNN-CTC training program.

    Creates the data layers, the encoder, the warp-CTC loss, an
    edit-distance evaluator, a cloned inference program, and the Momentum
    optimizer (optionally wrapped by ModelAverage).

    Args:
        args: parsed options; reads use_gpu, total_step, average_window,
            min_average_window and max_average_window.
        data_shape: CHW shape of the input image layer.
        num_classes: number of character classes; index ``num_classes`` is
            reserved for the CTC blank.

    Returns:
        (sum_cost, error_evaluator, inference_program, model_average)
    """
    L2_RATE = 0.0004
    LR = 1.0e-3
    MOMENTUM = 0.9
    # Fixed to None; the piecewise branch below is kept for experimentation.
    learning_rate_decay = None
    regularizer = fluid.regularizer.L2Decay(L2_RATE)

    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)

    fc_out = encoder_net(
        images,
        num_classes,
        regularizer=regularizer,
        use_cudnn=bool(args.use_gpu))

    # CTC loss over per-timestep scores; blank symbol is num_classes.
    cost = fluid.layers.warpctc(
        input=fc_out, label=label, blank=num_classes, norm_by_times=True)
    sum_cost = fluid.layers.reduce_sum(cost)

    # Greedy decoding + edit distance for training-time accuracy tracking.
    decoded = fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes)
    label_int64 = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded, label=label_int64)

    # Snapshot the graph for evaluation before optimizer ops are appended.
    inference_program = fluid.default_main_program().clone(for_test=True)

    if learning_rate_decay == "piecewise_decay":
        boundaries = [
            args.total_step // 4, args.total_step // 2,
            args.total_step * 3 // 4
        ]
        values = [LR, LR * 0.1, LR * 0.01, LR * 0.001]
        learning_rate = fluid.layers.piecewise_decay(boundaries, values)
    else:
        learning_rate = LR

    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate, momentum=MOMENTUM)
    optimizer.minimize(sum_cost)

    model_average = None
    if args.average_window > 0:
        model_average = fluid.optimizer.ModelAverage(
            args.average_window,
            min_average_window=args.min_average_window,
            max_average_window=args.max_average_window)
    return sum_cost, error_evaluator, inference_program, model_average


def ctc_infer(images, num_classes, use_cudnn):
    """Inference graph: run the encoder and greedily decode the CTC output.

    Args:
        images: input image variable.
        num_classes: number of character classes (blank = num_classes).
        use_cudnn: forwarded to conv/pool layers.

    Returns:
        The greedily decoded label sequence variable.
    """
    logits = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn)
    decoded = fluid.layers.ctc_greedy_decoder(input=logits, blank=num_classes)
    return decoded


def ctc_eval(data_shape, num_classes, use_cudnn):
    """Evaluation graph: edit-distance evaluator plus raw CTC cost.

    Args:
        data_shape: CHW shape of the input image layer.
        num_classes: number of character classes (blank = num_classes).
        use_cudnn: forwarded to conv/pool layers.

    Returns:
        (error_evaluator, cost): the edit-distance evaluator over greedy
        decodings and the per-sample warp-CTC cost.
    """
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)

    logits = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn)
    decoded = fluid.layers.ctc_greedy_decoder(input=logits, blank=num_classes)

    # EditDistance expects int64 labels; the data layer produces int32.
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded, label=fluid.layers.cast(x=label, dtype='int64'))

    cost = fluid.layers.warpctc(
        input=logits, label=label, blank=num_classes, norm_by_times=True)

    return error_evaluator, cost