# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
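
"""Unit test for an imperative (dygraph) OCR attention model.

The model pairs a small conv + bidirectional GRU encoder with an
attention-based GRU decoder. The test trains it in dygraph mode and in
static-graph mode with the same seed and data, then checks that losses
and parameter values match.
"""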

import unittest

import numpy as np

from test_imperative_base import new_program_scope

import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import BatchNorm, Linear


class Config:
    '''
    Config for training.
    '''

    # encoder rnn hidden size
    encoder_size = 8
    # decoder hidden size
    decoder_size = 8
    # word embedding dimension
    word_vector_dim = 8
    # max length for label padding
    max_length = 3
    # optimizer settings
    LR = 1.0
    learning_rate_decay = None

    # batch size for training
    batch_size = 2
    # number of classes to classify
    num_classes = 64

    use_gpu = False
    # special labels for sequence start (SOS) and end (EOS)
    SOS = 0
    EOS = 1
    # settings for ctc data, not used in this unittest
    DATA_DIR_NAME = "./dataset/ctc_data/data"
    TRAIN_DATA_DIR_NAME = "train_images"
    TRAIN_LIST_FILE_NAME = "train.list"

    # input image shape: [channels, height, width]
    DATA_SHAPE = [1, 16, 64]


class ConvBNPool(paddle.nn.Layer):
    def __init__(
        self,
        group,
        out_ch,
        channels,
        act="relu",
        is_test=False,
        pool=True,
        use_cudnn=True,
    ):
        super().__init__()
        self.group = group
        self.pool = pool

        filter_size = 3
        conv_std_0 = (2.0 / (filter_size**2 * channels[0])) ** 0.5
        conv_param_0 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_0)
        )

        conv_std_1 = (2.0 / (filter_size**2 * channels[1])) ** 0.5
        conv_param_1 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_1)
        )

        self.conv_0_layer = paddle.nn.Conv2D(
            channels[0],
            out_ch[0],
            3,
            padding=1,
            weight_attr=conv_param_0,
            bias_attr=False,
        )
        self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test)
        self.conv_1_layer = paddle.nn.Conv2D(
            out_ch[0],
            out_ch[1],
            3,
            padding=1,
            weight_attr=conv_param_1,
            bias_attr=False,
        )
        self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test)

        if self.pool:
            self.pool_layer = paddle.nn.MaxPool2D(
                kernel_size=2,
                stride=2,
                ceil_mode=True,
            )

    def forward(self, inputs):
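        # two conv + batch-norm blocks, then an optional 2x2 max pool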
        conv_0 = self.conv_0_layer(inputs)
        bn_0 = self.bn_0_layer(conv_0)
        conv_1 = self.conv_1_layer(bn_0)
        bn_1 = self.bn_1_layer(conv_1)
        if self.pool:
            bn_pool = self.pool_layer(bn_1)
            return bn_pool
        return bn_1


class OCRConv(paddle.nn.Layer):
    def __init__(self, is_test=False, use_cudnn=True):
        super().__init__()
        self.conv_bn_pool_1 = ConvBNPool(
            2, [8, 8], [1, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_2 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_3 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_4 = ConvBNPool(
            2,
            [16, 16],
            [8, 16],
            is_test=is_test,
            pool=False,
            use_cudnn=use_cudnn,
        )

    def forward(self, inputs):
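        # four ConvBNPool groups; the first three downsample via max pooling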
        inputs_1 = self.conv_bn_pool_1(inputs)
        inputs_2 = self.conv_bn_pool_2(inputs_1)
        inputs_3 = self.conv_bn_pool_3(inputs_2)
        inputs_4 = self.conv_bn_pool_4(inputs_3)

        return inputs_4


class DynamicGRU(paddle.nn.Layer):
    def __init__(
        self,
        size,
        param_attr=None,
        bias_attr=None,
        is_reverse=False,
        gate_activation='sigmoid',
        candidate_activation='tanh',
        h_0=None,
        origin_mode=False,
    ):
        super().__init__()

        self.gru_unit = paddle.nn.GRUCell(
            size * 3,
            size,
        )

        self.h_0 = h_0
        self.is_reverse = is_reverse
        self.size = size

    def forward(self, inputs):
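        # unroll the GRUCell over the time axis (dim 1), stepping backwards
        # through the sequence when is_reverse is True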
        hidden = self.h_0
        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                i = inputs.shape[1] - 1 - i
            input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1])
            input_ = paddle.reshape(input_, [-1, input_.shape[2]])
            hidden, reset = self.gru_unit(input_, hidden)
            hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
            if self.is_reverse:
                res = [hidden_] + res
            else:
                res.append(hidden_)
        res = paddle.concat(res, axis=1)
        return res


class EncoderNet(paddle.nn.Layer):
    def __init__(
        self, rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True
    ):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        para_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02)
        )
        bias_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02),
            learning_rate=2.0,
        )
        if fluid.framework.in_dygraph_mode():
            h_0 = np.zeros(
                (Config.batch_size, rnn_hidden_size), dtype="float32"
            )
            h_0 = to_variable(h_0)
        else:
            h_0 = paddle.tensor.fill_constant(
                shape=[Config.batch_size, rnn_hidden_size],
                dtype='float32',
                value=0,
            )
        self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)

        self.fc_1_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.fc_2_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.gru_forward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
        )
        self.gru_backward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
            is_reverse=True,
        )

        self.encoded_proj_fc = Linear(
            rnn_hidden_size * 2, Config.decoder_size, bias_attr=False
        )

    def forward(self, inputs):
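        # flatten the conv feature maps into a width-wise sequence, run
        # forward and backward GRUs over it, and project the concatenated
        # states for use by the attention module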
        conv_features = self.ocr_convs(inputs)
        # sliced_feature = fluid.layers.im2sequence(
        #    input=conv_features,
        #    stride=[1, 1],
        #    filter_size=[conv_features.shape[2], 1])

        transpose_conv_features = paddle.transpose(
            conv_features, perm=[0, 3, 1, 2]
        )
        sliced_feature = paddle.reshape(
            transpose_conv_features,
            [
                -1,
                8,
                transpose_conv_features.shape[2]
                * transpose_conv_features.shape[3],
            ],
        )
        fc_1 = self.fc_1_layer(sliced_feature)
        fc_2 = self.fc_2_layer(sliced_feature)
        gru_forward = self.gru_forward_layer(fc_1)

        gru_backward = self.gru_backward_layer(fc_2)

        encoded_vector = paddle.concat([gru_forward, gru_backward], axis=2)

        encoded_proj = self.encoded_proj_fc(encoded_vector)

        return gru_backward, encoded_vector, encoded_proj


class SimpleAttention(paddle.nn.Layer):
    def __init__(self, decoder_size):
        super().__init__()

        self.fc_1 = Linear(decoder_size, decoder_size, bias_attr=False)
        self.fc_2 = Linear(decoder_size, 1, bias_attr=False)

    def forward(self, encoder_vec, encoder_proj, decoder_state):
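        # additive (Bahdanau-style) attention: project the decoder state, add
        # it to the projected encoder states, score with tanh + fc_2, softmax
        # over time steps, and return the weighted sum as the context vector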

        decoder_state_fc = self.fc_1(decoder_state)
        decoder_state_proj_reshape = paddle.reshape(
            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]]
        )
        decoder_state_expand = paddle.expand(
            decoder_state_proj_reshape,
            [-1, encoder_proj.shape[1], -1],
        )
        concated = paddle.add(encoder_proj, decoder_state_expand)
        concated = paddle.tanh(x=concated)
        attention_weight = self.fc_2(concated)

        weights_reshape = paddle.reshape(
            x=attention_weight,
            shape=[attention_weight.shape[0], attention_weight.shape[1]],
        )

        weights_reshape = paddle.nn.functional.softmax(weights_reshape)
        scaled = paddle.tensor.math._multiply_with_axis(
            x=encoder_vec, y=weights_reshape, axis=0
        )
        context = paddle.sum(scaled, axis=1)

        return context


class GRUDecoderWithAttention(paddle.nn.Layer):
    def __init__(self, decoder_size, num_classes):
        super().__init__()
        self.simple_attention = SimpleAttention(decoder_size)

        self.fc_1_layer = Linear(
            Config.encoder_size * 2, decoder_size * 3, bias_attr=False
        )
        self.fc_2_layer = Linear(
            decoder_size, decoder_size * 3, bias_attr=False
        )
        self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size)
        self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None)

        self.decoder_size = decoder_size

    def forward(
        self, target_embedding, encoder_vec, encoder_proj, decoder_boot
    ):
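        # teacher-forced decoding: at each step, attend over the encoder
        # states, combine the context with the current target embedding, run
        # one GRUCell step, and emit a softmax over the output classes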
        res = []
        hidden_mem = decoder_boot
        for i in range(target_embedding.shape[1]):
            current_word = paddle.slice(
                target_embedding, axes=[1], starts=[i], ends=[i + 1]
            )
            current_word = paddle.reshape(
                current_word, [-1, current_word.shape[2]]
            )

            context = self.simple_attention(
                encoder_vec, encoder_proj, hidden_mem
            )
            fc_1 = self.fc_1_layer(context)
            fc_2 = self.fc_2_layer(current_word)
            decoder_inputs = paddle.add(x=fc_1, y=fc_2)

            h, _ = self.gru_unit(decoder_inputs, hidden_mem)
            hidden_mem = h
            out = self.out_layer(h)
            out = paddle.nn.functional.softmax(out)
            res.append(out)

        res1 = paddle.concat(res, axis=1)

        return res1


class OCRAttention(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.encoder_net = EncoderNet()
        self.fc = Linear(
            Config.encoder_size,
            Config.decoder_size,
            bias_attr=False,
        )
        self.embedding = paddle.nn.Embedding(
            Config.num_classes + 2, Config.word_vector_dim
        )
        self.gru_decoder_with_attention = GRUDecoderWithAttention(
            Config.decoder_size, Config.num_classes
        )

    def forward(self, inputs, label_in):
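        # the first step of the backward GRU bootstraps the decoder state;
        # label_in provides the teacher-forcing inputs for the decoder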
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
        backward_first = paddle.slice(
            gru_backward, axes=[1], starts=[0], ends=[1]
        )
        backward_first = paddle.reshape(
            backward_first, [-1, backward_first.shape[2]]
        )
        decoder_boot = self.fc(backward_first)
        decoder_boot = paddle.nn.functional.relu(decoder_boot)
        label_in = paddle.reshape(label_in, [-1])
        trg_embedding = self.embedding(label_in)

        trg_embedding = paddle.reshape(
            trg_embedding,
            [-1, Config.max_length, trg_embedding.shape[1]],
        )

        prediction = self.gru_decoder_with_attention(
            trg_embedding, encoded_vector, encoded_proj, decoder_boot
        )

        return prediction


class TestDygraphOCRAttention(unittest.TestCase):
    def test_ocr_test(self):
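        # Train the model for a few batches in dygraph mode (twice) and in
        # static-graph mode with the same seed and data, then check that the
        # losses and parameter values match across the runs.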
        seed = 90
        epoch_num = 1
        if core.is_compiled_with_cuda():
            batch_num = 3
        else:
            batch_num = 2
        np.random.seed(seed)
        image_np = np.random.randn(
            Config.batch_size,
            Config.DATA_SHAPE[0],
            Config.DATA_SHAPE[1],
            Config.DATA_SHAPE[2],
        ).astype('float32')
        label_in_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_in_np = np.vstack(
                (
                    label_in_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )

        label_out_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_out_np = np.vstack(
                (
                    label_out_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )

        def run_dygraph():
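            # one dygraph training run; returns the final loss value plus the
            # initial and trained parameter values keyed by parameter name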
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = fluid.layers.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR
            optimizer = fluid.optimizer.SGD(
                learning_rate=0.001, parameter_list=ocr_attention.parameters()
            )
            dy_param_init_value = {}
            for param in ocr_attention.parameters():
                dy_param_init_value[param.name] = param.numpy()
            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    label_in = to_variable(label_in_np)
                    label_out = to_variable(label_out_np)
                    label_out.stop_gradient = True
                    img = to_variable(image_np)
                    dy_prediction = ocr_attention(img, label_in)
                    label_out = paddle.reshape(label_out, [-1, 1])
                    dy_prediction = paddle.reshape(
                        dy_prediction, [label_out.shape[0], -1]
                    )
                    loss = paddle.nn.functional.cross_entropy(
                        input=dy_prediction,
                        label=label_out,
                        reduction='none',
                        use_softmax=False,
                    )
                    avg_loss = paddle.sum(loss)

                    dy_out = avg_loss.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in ocr_attention.parameters():
                            if param.name not in dy_param_init_value:
                                dy_param_init_value[param.name] = param.numpy()
                    avg_loss.backward()
                    dy_grad_value = {}
                    for param in ocr_attention.parameters():
                        if param.trainable:
                            np_array = np.array(
                                param._grad_ivar().value().get_tensor()
                            )
                            dy_grad_value[
                                param.name + core.grad_var_suffix()
                            ] = np_array

                    optimizer.minimize(avg_loss)
                    ocr_attention.clear_gradients()
                    dy_param_value = {}
                    for param in ocr_attention.parameters():
                        dy_param_value[param.name] = param.numpy()

            return dy_out, dy_param_init_value, dy_param_value

        with fluid.dygraph.guard():
            dy_out, dy_param_init_value, dy_param_value = run_dygraph()

        with fluid.dygraph.guard():
            (
                eager_out,
                eager_param_init_value,
                eager_param_value,
            ) = run_dygraph()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            exe = fluid.Executor(
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = fluid.layers.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR

            optimizer = fluid.optimizer.SGD(learning_rate=0.001)

            images = paddle.static.data(
                name='pixel', shape=[-1] + Config.DATA_SHAPE, dtype='float32'
            )
            images.desc.set_need_check_feed(False)
            static_label_in = paddle.static.data(
                name='label_in', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_in.desc.set_need_check_feed(False)
            static_label_out = paddle.static.data(
                name='label_out', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_out.desc.set_need_check_feed(False)

            static_label_out.stop_gradient = True
            static_label_out.trainable = False

            static_prediction = ocr_attention(images, static_label_in)

            static_prediction = paddle.reshape(
                static_prediction, shape=[-1, Config.num_classes + 2]
            )

            cost = paddle.nn.functional.cross_entropy(
                input=static_prediction,
                label=static_label_out,
                reduction='none',
                use_softmax=False,
            )
            static_avg_loss = paddle.sum(cost)
            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
            optimizer.minimize(static_avg_loss)

            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in ocr_attention.parameters():
                static_param_name_list.append(param.name)
                if param.trainable:
                    static_grad_name_list.append(
                        param.name + core.grad_var_suffix()
                    )

            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            fetch_list = [static_avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)
            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    static_label_in = label_in_np
                    static_label_out = label_out_np
                    static_label_out = static_label_out.reshape((-1, 1))
                    out = exe.run(
                        fluid.default_main_program(),
                        feed={
                            "pixel": image_np,
                            "label_in": static_label_in,
                            "label_out": static_label_out,
                        },
                        fetch_list=fetch_list,
                    )
                    static_param_value = {}
                    static_grad_value = {}
                    static_out = out[0]
                    for i in range(1, len(static_param_name_list) + 1):
                        static_param_value[static_param_name_list[i - 1]] = out[
                            i
                        ]
                    grad_start_pos = len(static_param_name_list) + 1
                    for i in range(
                        grad_start_pos,
                        len(static_grad_name_list) + grad_start_pos,
                    ):
                        static_grad_value[
                            static_grad_name_list[i - grad_start_pos]
                        ] = out[i]

        np.testing.assert_allclose(static_out, dy_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, dy_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, dy_param_value[key], rtol=1e-05, atol=1e-8
            )

        # check eager here
        np.testing.assert_allclose(static_out, eager_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, eager_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, eager_param_value[key], rtol=1e-05, atol=1e-8
            )


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()