# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from test_imperative_base import new_program_scope

import paddle
from paddle import fluid
from paddle.fluid import core
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import BatchNorm, Linear


class Config:
    '''
    config for training
    '''

    # encoder rnn hidden_size
    encoder_size = 8
    # decoder size for decoder stage
    decoder_size = 8
    # size for word embedding
    word_vector_dim = 8
    # max length for label padding
    max_length = 3
    # optimizer setting
    LR = 1.0
    learning_rate_decay = None

    # batch size to train
    batch_size = 2
    # class number to classify
    num_classes = 64

    use_gpu = False
    # special label for start and end
    SOS = 0
    EOS = 1
    # settings for ctc data, not used in this unittest
    DATA_DIR_NAME = "./dataset/ctc_data/data"
    TRAIN_DATA_DIR_NAME = "train_images"
    TRAIN_LIST_FILE_NAME = "train.list"

    # data shape for input image
    DATA_SHAPE = [1, 16, 64]


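# Conv block: two 3x3 convolutions, each followed by batch norm, with an optional 2x2 max pool.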
class ConvBNPool(paddle.nn.Layer):
    def __init__(
        self,
        group,
        out_ch,
        channels,
        act="relu",
        is_test=False,
        pool=True,
        use_cudnn=True,
    ):
        super().__init__()
        self.group = group
        self.pool = pool

        filter_size = 3
        conv_std_0 = (2.0 / (filter_size**2 * channels[0])) ** 0.5
        conv_param_0 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_0)
        )

        conv_std_1 = (2.0 / (filter_size**2 * channels[1])) ** 0.5
        conv_param_1 = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, conv_std_1)
        )

        self.conv_0_layer = paddle.nn.Conv2D(
            channels[0],
            out_ch[0],
            3,
            padding=1,
            weight_attr=conv_param_0,
            bias_attr=False,
        )
        self.bn_0_layer = BatchNorm(out_ch[0], act=act, is_test=is_test)
        self.conv_1_layer = paddle.nn.Conv2D(
            out_ch[0],
            out_ch[1],
            3,
            padding=1,
            weight_attr=conv_param_1,
            bias_attr=False,
        )
        self.bn_1_layer = BatchNorm(out_ch[1], act=act, is_test=is_test)

        if self.pool:
            self.pool_layer = paddle.nn.MaxPool2D(
                kernel_size=2,
                stride=2,
                ceil_mode=True,
            )

    def forward(self, inputs):
        conv_0 = self.conv_0_layer(inputs)
        bn_0 = self.bn_0_layer(conv_0)
        conv_1 = self.conv_1_layer(bn_0)
        bn_1 = self.bn_1_layer(conv_1)
        if self.pool:
            bn_pool = self.pool_layer(bn_1)
            return bn_pool
        return bn_1


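# CNN backbone: four stacked ConvBNPool blocks that turn the input image into feature maps.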
class OCRConv(paddle.nn.Layer):
    def __init__(self, is_test=False, use_cudnn=True):
        super().__init__()
        self.conv_bn_pool_1 = ConvBNPool(
            2, [8, 8], [1, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_2 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_3 = ConvBNPool(
            2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn
        )
        self.conv_bn_pool_4 = ConvBNPool(
            2,
            [16, 16],
            [8, 16],
            is_test=is_test,
            pool=False,
            use_cudnn=use_cudnn,
        )

    def forward(self, inputs):
        inputs_1 = self.conv_bn_pool_1(inputs)
        inputs_2 = self.conv_bn_pool_2(inputs_1)
        inputs_3 = self.conv_bn_pool_3(inputs_2)
        inputs_4 = self.conv_bn_pool_4(inputs_3)

        return inputs_4


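# Unrolls a GRUCell over the time dimension of its input, optionally in reverse order.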
class DynamicGRU(paddle.nn.Layer):
    def __init__(
        self,
        size,
        param_attr=None,
        bias_attr=None,
        is_reverse=False,
        gate_activation='sigmoid',
        candidate_activation='tanh',
        h_0=None,
        origin_mode=False,
    ):
        super().__init__()

        self.gru_unit = paddle.nn.GRUCell(
            size * 3,
            size,
        )

        self.h_0 = h_0
        self.is_reverse = is_reverse
        self.size = size

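    # Step the GRU cell across the sequence dimension and stack the per-step hidden states.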
    def forward(self, inputs):
        hidden = self.h_0
        res = []
        for i in range(inputs.shape[1]):
            if self.is_reverse:
                i = inputs.shape[1] - 1 - i
            input_ = paddle.slice(inputs, axes=[1], starts=[i], ends=[i + 1])
            input_ = paddle.reshape(input_, [-1, input_.shape[2]])
            hidden, reset = self.gru_unit(input_, hidden)
            hidden_ = paddle.reshape(hidden, [-1, 1, hidden.shape[1]])
            if self.is_reverse:
                res = [hidden_] + res
            else:
                res.append(hidden_)
        res = paddle.concat(res, axis=1)
        return res


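# Encoder: CNN feature extractor followed by forward and backward GRUs plus a projection used by the attention module.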
class EncoderNet(paddle.nn.Layer):
    def __init__(
        self, rnn_hidden_size=Config.encoder_size, is_test=False, use_cudnn=True
    ):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        para_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02)
        )
        bias_attr = fluid.ParamAttr(
            initializer=paddle.nn.initializer.Normal(0.0, 0.02),
            learning_rate=2.0,
        )
        if fluid.framework.in_dygraph_mode():
            h_0 = np.zeros(
                (Config.batch_size, rnn_hidden_size), dtype="float32"
            )
            h_0 = to_variable(h_0)
        else:
            h_0 = paddle.tensor.fill_constant(
                shape=[Config.batch_size, rnn_hidden_size],
                dtype='float32',
                value=0,
            )
        self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)

        self.fc_1_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.fc_2_layer = Linear(
            32, rnn_hidden_size * 3, weight_attr=para_attr, bias_attr=False
        )
        self.gru_forward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
        )
        self.gru_backward_layer = DynamicGRU(
            size=rnn_hidden_size,
            h_0=h_0,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu',
            is_reverse=True,
        )

        self.encoded_proj_fc = Linear(
            rnn_hidden_size * 2, Config.decoder_size, bias_attr=False
        )

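    # Extract conv features, reshape them into a sequence, and encode with the forward and backward GRUs.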
    def forward(self, inputs):
        conv_features = self.ocr_convs(inputs)
        # sliced_feature = fluid.layers.im2sequence(
        #    input=conv_features,
        #    stride=[1, 1],
        #    filter_size=[conv_features.shape[2], 1])

        transpose_conv_features = paddle.transpose(
            conv_features, perm=[0, 3, 1, 2]
        )
        sliced_feature = paddle.reshape(
            transpose_conv_features,
            [
                -1,
                8,
                transpose_conv_features.shape[2]
                * transpose_conv_features.shape[3],
            ],
        )
        fc_1 = self.fc_1_layer(sliced_feature)
        fc_2 = self.fc_2_layer(sliced_feature)
        gru_forward = self.gru_forward_layer(fc_1)

        gru_backward = self.gru_backward_layer(fc_2)

        encoded_vector = paddle.concat([gru_forward, gru_backward], axis=2)

        encoded_proj = self.encoded_proj_fc(encoded_vector)

        return gru_backward, encoded_vector, encoded_proj


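# Additive attention: scores each encoder state against the current decoder state and returns a weighted context vector.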
class SimpleAttention(paddle.nn.Layer):
    def __init__(self, decoder_size):
        super().__init__()

        self.fc_1 = Linear(decoder_size, decoder_size, bias_attr=False)
        self.fc_2 = Linear(decoder_size, 1, bias_attr=False)

    def forward(self, encoder_vec, encoder_proj, decoder_state):
        decoder_state_fc = self.fc_1(decoder_state)
        decoder_state_proj_reshape = paddle.reshape(
            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]]
        )
        decoder_state_expand = paddle.expand(
            decoder_state_proj_reshape,
            [-1, encoder_proj.shape[1], -1],
        )
        concated = paddle.add(encoder_proj, decoder_state_expand)
        concated = paddle.tanh(x=concated)
        attention_weight = self.fc_2(concated)

        weights_reshape = paddle.reshape(
            x=attention_weight,
            shape=[attention_weight.shape[0], attention_weight.shape[1]],
        )

        weights_reshape = paddle.nn.functional.softmax(weights_reshape)
        scaled = paddle.tensor.math._multiply_with_axis(
            x=encoder_vec, y=weights_reshape, axis=0
        )
        context = paddle.sum(scaled, axis=1)

        return context


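# GRU decoder that attends to the encoder outputs at every decoding step.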
class GRUDecoderWithAttention(paddle.nn.Layer):
    def __init__(self, decoder_size, num_classes):
        super().__init__()
        self.simple_attention = SimpleAttention(decoder_size)

        self.fc_1_layer = Linear(
            Config.encoder_size * 2, decoder_size * 3, bias_attr=False
        )
        self.fc_2_layer = Linear(
            decoder_size, decoder_size * 3, bias_attr=False
        )
        self.gru_unit = paddle.nn.GRUCell(decoder_size * 3, decoder_size)
        self.out_layer = Linear(decoder_size, num_classes + 2, bias_attr=None)

        self.decoder_size = decoder_size

    def forward(
        self, target_embedding, encoder_vec, encoder_proj, decoder_boot
    ):
        res = []
        hidden_mem = decoder_boot
        for i in range(target_embedding.shape[1]):
            current_word = paddle.slice(
                target_embedding, axes=[1], starts=[i], ends=[i + 1]
            )
            current_word = paddle.reshape(
                current_word, [-1, current_word.shape[2]]
            )

            context = self.simple_attention(
                encoder_vec, encoder_proj, hidden_mem
            )
            fc_1 = self.fc_1_layer(context)
            fc_2 = self.fc_2_layer(current_word)
            decoder_inputs = paddle.add(x=fc_1, y=fc_2)

            h, _ = self.gru_unit(decoder_inputs, hidden_mem)
            hidden_mem = h
            out = self.out_layer(h)
            out = paddle.nn.functional.softmax(out)
            res.append(out)

        res1 = paddle.concat(res, axis=1)

        return res1


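# Full model: CNN/GRU encoder plus the attention-based GRU decoder over the label embedding.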
class OCRAttention(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.encoder_net = EncoderNet()
        self.fc = Linear(
            Config.encoder_size,
            Config.decoder_size,
            bias_attr=False,
        )
        self.embedding = paddle.nn.Embedding(
            Config.num_classes + 2, Config.word_vector_dim
        )
        self.gru_decoder_with_attention = GRUDecoderWithAttention(
            Config.decoder_size, Config.num_classes
        )

    def forward(self, inputs, label_in):
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
        backward_first = paddle.slice(
            gru_backward, axes=[1], starts=[0], ends=[1]
        )
        backward_first = paddle.reshape(
            backward_first, [-1, backward_first.shape[2]]
        )
        decoder_boot = self.fc(backward_first)
        decoder_boot = paddle.nn.functional.relu(decoder_boot)
        label_in = paddle.reshape(label_in, [-1])
        trg_embedding = self.embedding(label_in)

        trg_embedding = paddle.reshape(
            trg_embedding,
            [-1, Config.max_length, trg_embedding.shape[1]],
        )

        prediction = self.gru_decoder_with_attention(
            trg_embedding, encoded_vector, encoded_proj, decoder_boot
        )

        return prediction


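# Trains the OCR attention model twice in dygraph mode and once in static-graph mode,
# then checks that the losses and the initial and trained parameter values match.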
class TestDygraphOCRAttention(unittest.TestCase):
    def test_ocr_test(self):
        seed = 90
        epoch_num = 1
        if core.is_compiled_with_cuda():
            batch_num = 3
        else:
            batch_num = 2
        np.random.seed(seed)
        image_np = np.random.randn(
            Config.batch_size,
            Config.DATA_SHAPE[0],
            Config.DATA_SHAPE[1],
            Config.DATA_SHAPE[2],
        ).astype('float32')
        label_in_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_in_np = np.vstack(
                (
                    label_in_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )

        label_out_np = np.arange(0, Config.max_length, dtype='int64').reshape(
            [1, Config.max_length]
        )
        for i in range(2, Config.batch_size + 1):
            label_out_np = np.vstack(
                (
                    label_out_np,
                    np.arange(
                        (i - 1) * Config.max_length,
                        i * Config.max_length,
                        dtype='int64',
                    ).reshape([1, Config.max_length]),
                )
            )

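        # Trains the model imperatively; returns the last-batch loss and the initial and final parameter values.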
        def run_dygraph():
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = paddle.optimizer.lr.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR
            optimizer = paddle.optimizer.SGD(
                learning_rate=0.001, parameters=ocr_attention.parameters()
            )
            dy_param_init_value = {}
            for param in ocr_attention.parameters():
                dy_param_init_value[param.name] = param.numpy()
            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    label_in = to_variable(label_in_np)
                    label_out = to_variable(label_out_np)
                    label_out.stop_gradient = True
                    img = to_variable(image_np)
                    dy_prediction = ocr_attention(img, label_in)
                    label_out = paddle.reshape(label_out, [-1, 1])
                    dy_prediction = paddle.reshape(
                        dy_prediction, [label_out.shape[0], -1]
                    )
                    loss = paddle.nn.functional.cross_entropy(
                        input=dy_prediction,
                        label=label_out,
                        reduction='none',
                        use_softmax=False,
                    )
                    avg_loss = paddle.sum(loss)

                    dy_out = avg_loss.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in ocr_attention.parameters():
                            if param.name not in dy_param_init_value:
                                dy_param_init_value[param.name] = param.numpy()
                    avg_loss.backward()
                    dy_grad_value = {}
                    for param in ocr_attention.parameters():
                        if param.trainable:
                            np_array = np.array(
                                param._grad_ivar().value().get_tensor()
                            )
                            dy_grad_value[
                                param.name + core.grad_var_suffix()
                            ] = np_array

                    optimizer.minimize(avg_loss)
                    ocr_attention.clear_gradients()
                    dy_param_value = {}
                    for param in ocr_attention.parameters():
                        dy_param_value[param.name] = param.numpy()

            return dy_out, dy_param_init_value, dy_param_value

        with fluid.dygraph.guard():
            dy_out, dy_param_init_value, dy_param_value = run_dygraph()

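        # Run dygraph training a second time to collect the eager-mode results.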
        with fluid.dygraph.guard():
            (
                eager_out,
                eager_param_init_value,
                eager_param_value,
            ) = run_dygraph()

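        # Rebuild and train the same model in static-graph mode for comparison.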
        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            exe = fluid.Executor(
                fluid.CPUPlace()
                if not core.is_compiled_with_cuda()
                else fluid.CUDAPlace(0)
            )
            ocr_attention = OCRAttention()

            if Config.learning_rate_decay == "piecewise_decay":
                learning_rate = paddle.optimizer.lr.piecewise_decay(
                    [50000], [Config.LR, Config.LR * 0.01]
                )
            else:
                learning_rate = Config.LR

            optimizer = paddle.optimizer.SGD(learning_rate=0.001)

            images = paddle.static.data(
                name='pixel', shape=[-1] + Config.DATA_SHAPE, dtype='float32'
            )
            images.desc.set_need_check_feed(False)
            static_label_in = paddle.static.data(
                name='label_in', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_in.desc.set_need_check_feed(False)
            static_label_out = paddle.static.data(
                name='label_out', shape=[-1, 1], dtype='int64', lod_level=0
            )
            static_label_out.desc.set_need_check_feed(False)

            static_label_out.stop_gradient = True
            static_label_out.trainable = False

            static_prediction = ocr_attention(images, static_label_in)

            static_prediction = paddle.reshape(
                static_prediction, shape=[-1, Config.num_classes + 2]
            )

            cost = paddle.nn.functional.cross_entropy(
                input=static_prediction,
                label=static_label_out,
                reduction='none',
                use_softmax=False,
            )
            static_avg_loss = paddle.sum(cost)
            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
            optimizer.minimize(static_avg_loss)

            static_param_init_value = {}
            static_param_name_list = []
            static_grad_name_list = []
            for param in ocr_attention.parameters():
                static_param_name_list.append(param.name)
                if param.trainable:
                    static_grad_name_list.append(
                        param.name + core.grad_var_suffix()
                    )

            out = exe.run(
                fluid.default_startup_program(),
                fetch_list=static_param_name_list,
            )

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            fetch_list = [static_avg_loss.name]
            fetch_list.extend(static_param_name_list)
            fetch_list.extend(static_grad_name_list)
            for epoch in range(epoch_num):
                for batch_id in range(batch_num):
                    static_label_in = label_in_np
                    static_label_out = label_out_np
                    static_label_out = static_label_out.reshape((-1, 1))
                    out = exe.run(
                        fluid.default_main_program(),
                        feed={
                            "pixel": image_np,
                            "label_in": static_label_in,
                            "label_out": static_label_out,
                        },
                        fetch_list=fetch_list,
                    )
                    static_param_value = {}
                    static_grad_value = {}
                    static_out = out[0]
                    for i in range(1, len(static_param_name_list) + 1):
                        static_param_value[static_param_name_list[i - 1]] = out[
                            i
                        ]
                    grad_start_pos = len(static_param_name_list) + 1
                    for i in range(
                        grad_start_pos,
                        len(static_grad_name_list) + grad_start_pos,
                    ):
                        static_grad_value[
                            static_grad_name_list[i - grad_start_pos]
                        ] = out[i]

        np.testing.assert_allclose(static_out, dy_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, dy_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, dy_param_value[key], rtol=1e-05, atol=1e-8
            )

        # check eager here
        np.testing.assert_allclose(static_out, eager_out, rtol=1e-05, atol=1e-8)

        for key, value in static_param_init_value.items():
            np.testing.assert_array_equal(value, eager_param_init_value[key])

        for key, value in static_param_value.items():
            np.testing.assert_allclose(
                value, eager_param_value[key], rtol=1e-05, atol=1e-8
            )


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()