import paddle.fluid as fluid
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import math


def conv_bn_pool(input,
                 group,
                 out_ch,
                 act="relu",
                 param=None,
                 bias=None,
                 param_0=None,
                 is_test=False,
                 pooling=True,
                 use_cudnn=False):
    """Stack of `group` conv + batch_norm layers, optionally followed by a
    2x2 stride-2 max-pool.

    Args:
        input: 4-D input feature map variable.
        group (int): number of conv+bn repetitions.
        out_ch (list): output channel count for each conv layer; must have
            at least `group` entries.
        act (str): activation applied by batch_norm (the conv itself is linear).
        param: ParamAttr for the conv weights and the bn scale.
        bias: ParamAttr for the bn bias.
        param_0: when set, overrides `param` for the conv weights (used to
            give the first group a different weight initializer).
        is_test (bool): run batch_norm in inference mode.
        pooling (bool): append the max-pool when True.
        use_cudnn (bool): use cuDNN kernels for conv/pool.

    Returns:
        The output feature map variable.
    """
    tmp = input
    # `range` replaces the Python-2-only `xrange`; semantics are identical here.
    for i in range(group):
        tmp = fluid.layers.conv2d(
            input=tmp,
            num_filters=out_ch[i],
            filter_size=3,
            padding=1,
            param_attr=param if param_0 is None else param_0,
            act=None,  # LinearActivation: the activation is fused into batch_norm below
            use_cudnn=use_cudnn)
        tmp = fluid.layers.batch_norm(
            input=tmp,
            act=act,
            param_attr=param,
            bias_attr=bias,
            is_test=is_test)
    if pooling:
        tmp = fluid.layers.pool2d(
            input=tmp,
            pool_size=2,
            pool_type='max',
            pool_stride=2,
            use_cudnn=use_cudnn,
            ceil_mode=True)

    return tmp


def ocr_convs(input,
              regularizer=None,
              gradient_clip=None,
              is_test=False,
              use_cudnn=False):
    """Convolutional backbone: four conv-bn groups widening the channels
    16 -> 32 -> 64 -> 128; every group but the last ends in a 2x2 max-pool.
    """

    def _attr(std):
        # All parameters share the regularizer/clip; only the std of the
        # Normal initializer differs between weight and bias attrs.
        return fluid.ParamAttr(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=fluid.initializer.Normal(0.0, std))

    bias_attr = _attr(0.0)
    first_conv_attr = _attr(0.0005)
    conv_attr = _attr(0.01)

    feat = conv_bn_pool(
        input,
        2, [16, 16],
        param=conv_attr,
        bias=bias_attr,
        param_0=first_conv_attr,  # special init for the very first conv group
        is_test=is_test,
        use_cudnn=use_cudnn)
    feat = conv_bn_pool(
        feat,
        2, [32, 32],
        param=conv_attr,
        bias=bias_attr,
        is_test=is_test,
        use_cudnn=use_cudnn)
    feat = conv_bn_pool(
        feat,
        2, [64, 64],
        param=conv_attr,
        bias=bias_attr,
        is_test=is_test,
        use_cudnn=use_cudnn)
    feat = conv_bn_pool(
        feat,
        2, [128, 128],
        param=conv_attr,
        bias=bias_attr,
        is_test=is_test,
        pooling=False,  # keep full height for the sequence slicing step
        use_cudnn=use_cudnn)
    return feat


def encoder_net(images,
                num_classes,
                rnn_hidden_size=200,
                regularizer=None,
                gradient_clip=None,
                is_test=False,
                use_cudnn=False):
    """CRNN encoder: CNN backbone -> per-column sequence slicing ->
    bidirectional GRU -> per-timestep logits of size `num_classes` + 1
    (the extra class is reserved for the CTC blank).
    """
    conv_features = ocr_convs(
        images,
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        is_test=is_test,
        use_cudnn=use_cudnn)
    # Turn every 1-pixel-wide column of the feature map into one sequence step.
    sliced_feature = fluid.layers.im2sequence(
        input=conv_features,
        stride=[1, 1],
        filter_size=[conv_features.shape[2], 1])

    weight_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))
    # GRU bias gets a doubled learning rate.
    gru_bias_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02),
        learning_rate=2.0)
    fc_bias_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))

    # Separate input projections for the forward and backward GRUs;
    # dynamic_gru expects input of size 3 * hidden.
    fwd_proj = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=weight_attr,
                               bias_attr=fc_bias_attr)
    bwd_proj = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=weight_attr,
                               bias_attr=fc_bias_attr)

    gru_forward = fluid.layers.dynamic_gru(
        input=fwd_proj,
        size=rnn_hidden_size,
        param_attr=weight_attr,
        bias_attr=gru_bias_attr,
        candidate_activation='relu')
    gru_backward = fluid.layers.dynamic_gru(
        input=bwd_proj,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=weight_attr,
        bias_attr=gru_bias_attr,
        candidate_activation='relu')

    out_weight_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.02))
    out_bias_attr = fluid.ParamAttr(
        regularizer=regularizer,
        gradient_clip=gradient_clip,
        initializer=fluid.initializer.Normal(0.0, 0.0))

    # Concatenate both directions and project to class logits (+1 for blank).
    fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
                             size=num_classes + 1,
                             param_attr=out_weight_attr,
                             bias_attr=out_bias_attr)

    return fc_out


def ctc_train_net(images, label, args, num_classes):
    """Build the CTC training network.

    Args:
        images: input image variable.
        label: ground-truth label sequence variable.
        args: parsed arguments; reads use_gpu, total_step, average_window,
            min_average_window, max_average_window.
        num_classes (int): number of real classes; the CTC blank uses
            index `num_classes`.

    Returns:
        (sum_cost, error_evaluator, inference_program, model_average):
        summed warp-CTC loss, edit-distance evaluator over the greedy
        decoding, a forward-only clone of the main program, and the
        ModelAverage helper (None when args.average_window <= 0).
    """
    L2_RATE = 0.0004
    LR = 1.0e-3
    MOMENTUM = 0.9
    # NOTE(review): decay is disabled here; set to "piecewise_decay" to
    # enable the schedule below.
    learning_rate_decay = None
    regularizer = fluid.regularizer.L2Decay(L2_RATE)

    fc_out = encoder_net(
        images,
        num_classes,
        regularizer=regularizer,
        use_cudnn=bool(args.use_gpu))
    # blank=num_classes: the blank is the extra class appended by encoder_net.
    cost = fluid.layers.warpctc(
        input=fc_out, label=label, blank=num_classes, norm_by_times=True)
    sum_cost = fluid.layers.reduce_sum(cost)
    decoded_out = fluid.layers.ctc_greedy_decoder(
        input=fc_out, blank=num_classes)
    casted_label = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded_out, label=casted_label)
    # Clone before the optimizer ops are added so the test program
    # stays forward-only.
    inference_program = fluid.default_main_program().clone(for_test=True)
    if learning_rate_decay == "piecewise_decay":
        # Floor division: boundaries must be integer step counts, and `/`
        # would produce floats under Python 3 true division.
        learning_rate = fluid.layers.piecewise_decay([
            args.total_step // 4, args.total_step // 2,
            args.total_step * 3 // 4
        ], [LR, LR * 0.1, LR * 0.01, LR * 0.001])
    else:
        learning_rate = LR

    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate, momentum=MOMENTUM)
    _, params_grads = optimizer.minimize(sum_cost)
    model_average = None
    if args.average_window > 0:
        model_average = fluid.optimizer.ModelAverage(
            args.average_window,
            min_average_window=args.min_average_window,
            max_average_window=args.max_average_window)
    return sum_cost, error_evaluator, inference_program, model_average


def ctc_infer(images, num_classes, use_cudnn):
    """Inference network: encoder logits greedily decoded with CTC."""
    logits = encoder_net(
        images, num_classes, is_test=True, use_cudnn=use_cudnn)
    return fluid.layers.ctc_greedy_decoder(input=logits, blank=num_classes)


def ctc_eval(images, label, num_classes, use_cudnn):
    """Evaluation network.

    Returns an (edit-distance evaluator, warp-CTC cost) pair built on an
    inference-mode encoder.
    """
    logits = encoder_net(
        images, num_classes, is_test=True, use_cudnn=use_cudnn)
    decoded = fluid.layers.ctc_greedy_decoder(input=logits, blank=num_classes)

    # EditDistance requires int64 labels.
    label_int64 = fluid.layers.cast(x=label, dtype='int64')
    evaluator = fluid.evaluator.EditDistance(
        input=decoded, label=label_int64)

    ctc_cost = fluid.layers.warpctc(
        input=logits, label=label, blank=num_classes, norm_by_times=True)

    return evaluator, ctc_cost