diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b621330579ca647850f1524fa0e5c4fc891aa914..12f07f17bb54793168d5fac772ccf9c717609cec 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -82,6 +82,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
 list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 list(REMOVE_ITEM TEST_OPS test_layers)
+list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
 
 # Some ops need to check results when gc is enabled
 # Currently, only ops that register NoNeedBufferVarsInference need to do this test
@@ -137,7 +138,7 @@ py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mn
     FLAGS_cudnn_deterministic=1)
 py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
     FLAGS_cudnn_deterministic=1)
-
+py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS FLAGS_cudnn_deterministic=1)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..f63c82856bbcc0a0741e563c251f547361432daa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -0,0 +1,600 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
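+
+# This unittest builds an OCR-with-attention model (a CNN feature extractor,
+# a bidirectional GRU encoder and an attention-based GRU decoder), trains it
+# both in dygraph (imperative) mode and as a static graph with the same seed
+# and data, and then checks that the resulting loss and parameter values of
+# the two runs match.
+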
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+import os
+from PIL import Image
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+
+class Config(object):
+    '''
+    config for training
+    '''
+    # decoder size for decoder stage
+    decoder_size = 128
+    # size for word embedding
+    word_vector_dim = 128
+    # max length for label padding
+    max_length = 15
+    # optimizer setting
+    LR = 1.0
+    learning_rate_decay = None
+
+    # batch size to train
+    batch_size = 32
+    # class number to classify
+    num_classes = 481
+
+    use_gpu = False
+    # special label for start and end
+    SOS = 0
+    EOS = 1
+    # settings for ctc data, not used in this unittest
+    DATA_DIR_NAME = "./dataset/ctc_data/data"
+    TRAIN_DATA_DIR_NAME = "train_images"
+    TRAIN_LIST_FILE_NAME = "train.list"
+
+    # data shape for input image
+    DATA_SHAPE = [1, 48, 384]
+
+
+class ConvBNPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 group,
+                 out_ch,
+                 channels,
+                 act="relu",
+                 is_test=False,
+                 pool=True,
+                 use_cudnn=True):
+        super(ConvBNPool, self).__init__(name_scope)
+        self.group = group
+        self.pool = pool
+
+        filter_size = 3
+        conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5
+        conv_param_0 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_0))
+
+        conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5
+        conv_param_1 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_1))
+
+        self.conv_0_layer = Conv2D(
+            self.full_name(),
+            channels[0],
+            out_ch[0],
+            3,
+            padding=1,
+            param_attr=conv_param_0,
+            bias_attr=None,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_0_layer = BatchNorm(
+            self.full_name(), out_ch[0], act=act, is_test=is_test)
+        self.conv_1_layer = Conv2D(
+            self.full_name(),
+            num_channels=channels[1],
+            num_filters=out_ch[1],
+            filter_size=3,
+            padding=1,
+            param_attr=conv_param_1,
+            bias_attr=None,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_1_layer = BatchNorm(
+            self.full_name(), out_ch[1], act=act, is_test=is_test)
+
+        if self.pool:
+            self.pool_layer = Pool2D(
+                self.full_name(),
+                pool_size=2,
+                pool_type='max',
+                pool_stride=2,
+                use_cudnn=use_cudnn,
+                ceil_mode=True)
+
+    def forward(self, inputs):
+        conv_0 = self.conv_0_layer(inputs)
+        bn_0 = self.bn_0_layer(conv_0)
+        conv_1 = self.conv_1_layer(bn_0)
+        bn_1 = self.bn_1_layer(conv_1)
+        if self.pool:
+            bn_pool = self.pool_layer(bn_1)
+            return bn_pool
+        return bn_1
+
+
+class OCRConv(fluid.dygraph.Layer):
+    def __init__(self, name_scope, is_test=False, use_cudnn=True):
+        super(OCRConv, self).__init__(name_scope)
+        self.conv_bn_pool_1 = ConvBNPool(
+            self.full_name(),
+            2, [16, 16], [1, 16],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_2 = ConvBNPool(
+            self.full_name(),
+            2, [32, 32], [16, 32],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_3 = ConvBNPool(
+            self.full_name(),
+            2, [64, 64], [32, 64],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_4 = ConvBNPool(
+            self.full_name(),
+            2, [128, 128], [64, 128],
+            is_test=is_test,
+            pool=False,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        inputs_1 = self.conv_bn_pool_1(inputs)
+        inputs_2 = self.conv_bn_pool_2(inputs_1)
+        inputs_3 = self.conv_bn_pool_3(inputs_2)
+        inputs_4 = self.conv_bn_pool_4(inputs_3)
+
+        return inputs_4
+
+
+class DynamicGRU(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 h_0=None,
+                 origin_mode=False):
+        super(DynamicGRU, self).__init__(scope_name)
+
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+        self.h_0 = h_0
+        self.is_reverse = is_reverse
+
+    def forward(self, inputs):
+        hidden = self.h_0
+        res = []
+        for i in range(inputs.shape[1]):
+            if self.is_reverse:
+                i = inputs.shape[1] - 1 - i
+            input_ = fluid.layers.slice(
+                inputs, axes=[1], starts=[i], ends=[i + 1])
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
+            hidden, reset, gate = self.gru_unit(input_, hidden)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            if self.is_reverse:
+                res = [hidden_] + res
+            else:
+                res.append(hidden_)
+        res = fluid.layers.concat(res, axis=1)
+        return res
+
+
+class EncoderNet(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 rnn_hidden_size=200,
+                 is_test=False,
+                 use_cudnn=True):
+        super(EncoderNet, self).__init__(scope_name)
+        self.rnn_hidden_size = rnn_hidden_size
+        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
+                                                                         0.02))
+        bias_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
+        if fluid.framework.in_dygraph_mode():
+            h_0 = np.zeros(
+                (Config.batch_size, rnn_hidden_size), dtype="float32")
+            h_0 = to_variable(h_0)
+        else:
+            h_0 = fluid.layers.fill_constant(
+                shape=[Config.batch_size, rnn_hidden_size],
+                dtype='float32',
+                value=0)
+        self.ocr_convs = OCRConv(
+            self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.fc_2_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.gru_forward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu')
+        self.gru_backward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu',
+            is_reverse=True)
+
+        self.encoded_proj_fc = FC(self.full_name(),
+                                  Config.decoder_size,
+                                  bias_attr=False,
+                                  num_flatten_dims=2)
+
+    def forward(self, inputs):
+        conv_features = self.ocr_convs(inputs)
+        #sliced_feature = fluid.layers.im2sequence(
+        #    input=conv_features,
+        #    stride=[1, 1],
+        #    filter_size=[conv_features.shape[2], 1])
+
+        transpose_conv_features = fluid.layers.transpose(
+            conv_features, perm=[0, 3, 1, 2])
+
+        sliced_feature = fluid.layers.reshape(
+            transpose_conv_features, [
+                -1, 48, transpose_conv_features.shape[2] *
+                transpose_conv_features.shape[3]
+            ],
+            inplace=False)
+        fc_1 = self.fc_1_layer(sliced_feature)
+        fc_2 = self.fc_2_layer(sliced_feature)
+        gru_forward = self.gru_forward_layer(fc_1)
+
+        gru_backward = self.gru_backward_layer(fc_2)
+
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward, gru_backward], axis=2)
+
+        encoded_proj = self.encoded_proj_fc(encoded_vector)
+
+        return gru_backward, encoded_vector, encoded_proj
+
+
+class SimpleAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size):
+        super(SimpleAttention, self).__init__(scope_name)
+
+        self.fc_1 = FC(self.full_name(),
+                       decoder_size,
+                       act=None,
+                       bias_attr=False)
+        self.fc_2 = FC(self.full_name(), 1, act=None, bias_attr=False)
+
+    def _build_once(self, encoder_vec, encoder_proj, decoder_state):
+        pass
+
+    def forward(self, encoder_vec, encoder_proj, decoder_state):
+
+        decoder_state_fc = self.fc_1(decoder_state)
+        decoder_state_proj_reshape = fluid.layers.reshape(
+            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]], inplace=False)
+        decoder_state_expand = fluid.layers.expand(
+            decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1])
+        concated = fluid.layers.elementwise_add(encoder_proj,
+                                                decoder_state_expand)
+        concated = fluid.layers.tanh(x=concated)
+        attention_weight = self.fc_2(concated)
+        weights_reshape = fluid.layers.reshape(
+            x=attention_weight, shape=[-1], inplace=False)
+        scaled = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=weights_reshape, axis=0)
+        scaled = fluid.layers.transpose(scaled, [0, 2, 1])
+        scaled = fluid.layers.reshape(
+            scaled, [-1, scaled.shape[1], scaled.shape[2], 1], inplace=False)
+        context = fluid.layers.pool2d(
+            input=scaled,
+            pool_size=[scaled.shape[2], scaled.shape[3]],
+            pool_type='avg')
+        context = fluid.layers.reshape(
+            context, [-1, context.shape[1]], inplace=False)
+        return context
+
+
+class GRUDecoderWithAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size, num_classes):
+        super(GRUDecoderWithAttention, self).__init__(scope_name)
+        self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.fc_2_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size=decoder_size * 3,
+            param_attr=None,
+            bias_attr=None)
+        self.out_layer = FC(self.full_name(),
+                            size=num_classes + 2,
+                            bias_attr=None,
+                            act='softmax')
+
+        self.decoder_size = decoder_size
+
+    def _build_once(self, target_embedding, encoder_vec, encoder_proj,
+                    decoder_boot):
+        pass
+
+    def forward(self, target_embedding, encoder_vec, encoder_proj,
+                decoder_boot):
+        res = []
+        hidden_mem = decoder_boot
+        for i in range(target_embedding.shape[1]):
+            current_word = fluid.layers.slice(
+                target_embedding, axes=[1], starts=[i], ends=[i + 1])
+            current_word = fluid.layers.reshape(
+                current_word, [-1, current_word.shape[2]], inplace=False)
+
+            context = self.simple_attention(encoder_vec, encoder_proj,
+                                            hidden_mem)
+            fc_1 = self.fc_1_layer(context)
+            fc_2 = self.fc_2_layer(current_word)
+            decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2)
+
+            h, _, _ = self.gru_unit(decoder_inputs, hidden_mem)
+            hidden_mem = h
+            out = self.out_layer(h)
+            res.append(out)
+
+        res1 = fluid.layers.concat(res, axis=0)
+
+        return res1
+
+
+class OCRAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name):
+        super(OCRAttention, self).__init__(scope_name)
+        self.encoder_net = EncoderNet(self.full_name())
+        self.fc = FC(self.full_name(),
+                     size=Config.decoder_size,
+                     bias_attr=False,
+                     act='relu')
+        self.embedding = Embedding(
+            self.full_name(), [Config.num_classes + 2, Config.word_vector_dim],
+            dtype='float32')
+        self.gru_decoder_with_attention = GRUDecoderWithAttention(
+            self.full_name(), Config.decoder_size, Config.num_classes)
+
+    def _build_once(self, inputs, label_in):
+        pass
+
+    def forward(self, inputs, label_in):
+        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
+
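+        # The first timestep of the backward GRU output is projected through a
+        # ReLU FC below to form the decoder's initial hidden state (decoder_boot).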
+        backward_first = fluid.layers.slice(
+            gru_backward, axes=[1], starts=[0], ends=[1])
+        backward_first = fluid.layers.reshape(
+            backward_first, [-1, backward_first.shape[2]], inplace=False)
+        decoder_boot = self.fc(backward_first)
+        label_in = fluid.layers.reshape(label_in, [-1, 1], inplace=False)
+        trg_embedding = self.embedding(label_in)
+
+        trg_embedding = fluid.layers.reshape(
+            trg_embedding, [-1, Config.max_length, trg_embedding.shape[1]],
+            inplace=False)
+
+        prediction = self.gru_decoder_with_attention(
+            trg_embedding, encoded_vector, encoded_proj, decoder_boot)
+
+        return prediction
+
+
+class TestDygraphOCRAttention(unittest.TestCase):
+    def test_ocr_attention(self):
+        seed = 90
+        epoch_num = 2
+        batch_num = 20
+        np.random.seed(seed)
+        image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0],
+                                   Config.DATA_SHAPE[1],
+                                   Config.DATA_SHAPE[2]).astype('float32')
+        label_in_np = np.arange(
+            0, Config.max_length,
+            dtype='int64').reshape([1, Config.max_length])
+        for i in range(2, Config.batch_size + 1):
+            label_in_np = np.vstack((label_in_np, np.arange(
+                (i - 1) * Config.max_length,
+                i * Config.max_length,
+                dtype='int64').reshape([1, Config.max_length])))
+
+        print(label_in_np.shape)
+        label_out_np = np.arange(
+            0, Config.max_length,
+            dtype='int64').reshape([1, Config.max_length])
+        for i in range(2, Config.batch_size + 1):
+            label_out_np = np.vstack((label_out_np, np.arange(
+                (i - 1) * Config.max_length,
+                i * Config.max_length,
+                dtype='int64').reshape([1, Config.max_length])))
+        print(label_out_np.shape)
+        #if Config.use_gpu:
+        #    place = fluid.CUDAPlace(0)
+        #else:
+        #    place = fluid.CPUPlace()
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            backward_strategy = fluid.dygraph.BackwardStrategy()
+            backward_strategy.sort_sum_gradient = True
+            ocr_attention = OCRAttention("ocr_attention")
+
+            if Config.learning_rate_decay == "piecewise_decay":
+                learning_rate = fluid.layers.piecewise_decay(
+                    [50000], [Config.LR, Config.LR * 0.01])
+            else:
+                learning_rate = Config.LR
+            #optimizer = fluid.optimizer.Adadelta(learning_rate=learning_rate,
+            #    epsilon=1.0e-6, rho=0.9)
+            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            # place = fluid.CPUPlace()
+            dy_param_init_value = {}
+            for param in ocr_attention.parameters():
+                dy_param_init_value[param.name] = param.numpy()
+            for epoch in range(epoch_num):
+                for batch_id in range(batch_num):
+                    label_in = to_variable(label_in_np)
+                    label_out = to_variable(label_out_np)
+                    label_out._stop_gradient = True
+                    label_out.trainable = False
+                    img = to_variable(image_np)
+                    dy_prediction = ocr_attention(img, label_in)
+                    label_out = fluid.layers.reshape(
+                        label_out, [-1, 1], inplace=False)
+                    loss = fluid.layers.cross_entropy(
+                        input=dy_prediction, label=label_out)
+                    avg_loss = fluid.layers.reduce_sum(loss)
+
+                    dy_out = avg_loss.numpy()
+
+                    if epoch == 0 and batch_id == 0:
+                        for param in ocr_attention.parameters():
+                            if param.name not in dy_param_init_value:
+                                dy_param_init_value[param.name] = param.numpy()
+                    avg_loss.backward(backward_strategy)
+                    dy_grad_value = {}
+                    for param in ocr_attention.parameters():
+                        if param.trainable:
+                            np_array = np.array(param._ivar._grad_ivar().value()
+                                                .get_tensor())
+                            dy_grad_value[param.name + core.grad_var_suffix(
+                            )] = np_array
+
+                    optimizer.minimize(avg_loss)
+                    ocr_attention.clear_gradients()
+                    dy_param_value = {}
+                    for param in ocr_attention.parameters():
+                        dy_param_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            # print("static start")
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            ocr_attention = OCRAttention("ocr_attention")
+
+            if Config.learning_rate_decay == "piecewise_decay":
+                learning_rate = fluid.layers.piecewise_decay(
+                    [50000], [Config.LR, Config.LR * 0.01])
+            else:
+                learning_rate = Config.LR
+
+            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+
+            images = fluid.layers.data(
+                name='pixel', shape=Config.DATA_SHAPE, dtype='float32')
+            static_label_in = fluid.layers.data(
+                name='label_in', shape=[1], dtype='int64', lod_level=0)
+            static_label_out = fluid.layers.data(
+                name='label_out', shape=[1], dtype='int64', lod_level=0)
+            static_label_out._stop_gradient = True
+            static_label_out.trainable = False
+
+            static_prediction = ocr_attention(images, static_label_in)
+
+            cost = fluid.layers.cross_entropy(
+                input=static_prediction, label=static_label_out)
+            static_avg_loss = fluid.layers.reduce_sum(cost)
+            # param_grad_list = fluid.backward.append_backward(static_avg_loss)
+            optimizer.minimize(static_avg_loss)
+
+            static_param_init_value = {}
+            static_param_name_list = []
+            static_grad_name_list = []
+            for param in ocr_attention.parameters():
+                static_param_name_list.append(param.name)
+                if param.trainable:
+                    static_grad_name_list.append(param.name +
+                                                 core.grad_var_suffix())
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            fetch_list = [static_avg_loss.name]
+            # print(static_test.name)
+            # fetch_list = [static_avg_loss.name, static_test.name]
+            fetch_list.extend(static_param_name_list)
+            fetch_list.extend(static_grad_name_list)
+            for epoch in range(epoch_num):
+                for batch_id in range(batch_num):
+                    static_label_in = label_in_np
+                    static_label_out = label_out_np
+                    static_label_out = static_label_out.reshape((-1, 1))
+                    out = exe.run(fluid.default_main_program(),
+                                  feed={
+                                      "pixel": image_np,
+                                      "label_in": static_label_in,
+                                      "label_out": static_label_out
+                                  },
+                                  fetch_list=fetch_list)
+                    static_param_value = {}
+                    static_grad_value = {}
+                    static_out = out[0]
+                    # static_test_grad = out[1]
+                    for i in range(1, len(static_param_name_list) + 1):
+                        static_param_value[static_param_name_list[i - 1]] = out[
+                            i]
+                    grad_start_pos = len(static_param_name_list) + 1
+                    for i in range(grad_start_pos,
+                                   len(static_grad_name_list) + grad_start_pos):
+                        static_grad_value[static_grad_name_list[
+                            i - grad_start_pos]] = out[i]
+
+        self.assertTrue(np.array_equal(static_out, dy_out))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.array_equal(value, dy_param_init_value[key]))
+
+        for key, value in six.iteritems(static_param_value):
+            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-20))
+
+
+if __name__ == '__main__':
+    unittest.main()