diff --git a/PaddleCV/ocr_recognition/attention_model.py b/PaddleCV/ocr_recognition/attention_model.py
index 963d2168fd6ec4f53724573895344825c18558a3..3ee58e5c5b916b17ce19f9abba9604397c47f6d6 100755
--- a/PaddleCV/ocr_recognition/attention_model.py
+++ b/PaddleCV/ocr_recognition/attention_model.py
@@ -16,6 +16,7 @@ from __future__ import division
 from __future__ import print_function
 import paddle.fluid as fluid
 import six
+import numpy as np
 
 decoder_size = 128
 word_vector_dim = 128
@@ -24,6 +25,7 @@ sos = 0
 eos = 1
 beam_size = 1
 
+
 def conv_bn_pool(input,
                  group,
                  out_ch,
@@ -79,32 +81,42 @@ def encoder_net(images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
         stride=[1, 1],
         filter_size=[conv_features.shape[2], 1])
 
+    pad_value = fluid.layers.assign(input=np.array([0.0], dtype=np.float32))
+    sliced_feature_pad, output_len = fluid.layers.sequence_pad(
+        sliced_feature, pad_value, maxlen=48)
+
     para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
     bias_attr = fluid.ParamAttr(
         initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
 
-    fc_1 = fluid.layers.fc(input=sliced_feature,
+    fc_1 = fluid.layers.fc(input=sliced_feature_pad,
                            size=rnn_hidden_size * 3,
                            param_attr=para_attr,
-                           bias_attr=False)
-    fc_2 = fluid.layers.fc(input=sliced_feature,
+                           bias_attr=False,
+                           num_flatten_dims=2)
+    fc_2 = fluid.layers.fc(input=sliced_feature_pad,
                            size=rnn_hidden_size * 3,
                            param_attr=para_attr,
-                           bias_attr=False)
-
-    gru_forward = fluid.layers.dynamic_gru(
-        input=fc_1,
-        size=rnn_hidden_size,
+                           bias_attr=False,
+                           num_flatten_dims=2)
+    gru_cell_forward = fluid.layers.GRUCell(
+        hidden_size=rnn_hidden_size,
         param_attr=para_attr,
         bias_attr=bias_attr,
-        candidate_activation='relu')
-    gru_backward = fluid.layers.dynamic_gru(
-        input=fc_2,
-        size=rnn_hidden_size,
-        is_reverse=True,
+        activation=fluid.layers.relu)
+    gru_cell_backward = fluid.layers.GRUCell(
+        hidden_size=rnn_hidden_size,
         param_attr=para_attr,
         bias_attr=bias_attr,
-        candidate_activation='relu')
+        activation=fluid.layers.relu)
+    gru_forward, _ = fluid.layers.rnn(gru_cell_forward, inputs=fc_1)
+    gru_backward, _ = fluid.layers.rnn(gru_cell_backward,
+                                       inputs=fc_2,
+                                       is_reverse=True)
+    output_len = fluid.layers.reshape(output_len, [-1])
+    gru_forward = fluid.layers.sequence_unpad(x=gru_forward, length=output_len)
+    gru_backward = fluid.layers.sequence_unpad(
+        x=gru_backward, length=output_len)
 
     encoded_vector = fluid.layers.concat(
         input=[gru_forward, gru_backward], axis=1)
@@ -123,7 +135,9 @@ def gru_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                                              bias_attr=False)
         decoder_state_expand = fluid.layers.sequence_expand(
             x=decoder_state_proj, y=encoder_proj)
-        concated = encoder_proj + decoder_state_expand
+        #concated = encoder_proj + decoder_state_expand
+        concated = fluid.layers.elementwise_add(encoder_proj,
+                                                decoder_state_expand)
         concated = fluid.layers.tanh(x=concated)
         attention_weights = fluid.layers.fc(input=concated,
                                             size=1,
@@ -137,13 +151,19 @@ def gru_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
         context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
         return context
 
-    rnn = fluid.layers.DynamicRNN()
+    pad_value = fluid.layers.assign(np.array([0.0], dtype=np.float32))
+    target_embedding_pad, target_embedding_length = fluid.layers.sequence_pad(
+        target_embedding, pad_value)
+    target_embedding_length = fluid.layers.reshape(target_embedding_length,
+                                                   [-1])
 
-    with rnn.block():
-        current_word = rnn.step_input(target_embedding)
-        encoder_vec = rnn.static_input(encoder_vec)
-        encoder_proj = rnn.static_input(encoder_proj)
-        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+    target_embedding_pad = fluid.layers.transpose(target_embedding_pad,
+                                                  [1, 0, 2])
+    rnn = fluid.layers.StaticRNN()
+
+    with rnn.step():
+        current_word = rnn.step_input(target_embedding_pad)
+        hidden_mem = rnn.memory(init=decoder_boot)
         context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
         fc_1 = fluid.layers.fc(input=context,
                                size=decoder_size * 3,
@@ -159,8 +179,13 @@ def gru_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                               size=num_classes + 2,
                               bias_attr=True,
                               act='softmax')
-        rnn.output(out)
-    return rnn()
+        rnn.step_output(out)
+    rnn_out = rnn()
+    rnn_out = fluid.layers.transpose(rnn_out, [1, 0, 2])
+    rnn_out = fluid.layers.sequence_unpad(
+        x=rnn_out, length=target_embedding_length)
+
+    return rnn_out
 
 
 def attention_train_net(args, data_shape, num_classes):
@@ -188,7 +213,8 @@ def attention_train_net(args, data_shape, num_classes):
     prediction = gru_decoder_with_attention(trg_embedding, encoded_vector,
                                             encoded_proj, decoder_boot,
                                             decoder_size, num_classes)
-    fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(args.gradient_clip))
+    fluid.clip.set_gradient_clip(
+        fluid.clip.GradientClipByValue(args.gradient_clip))
     label_out = fluid.layers.cast(x=label_out, dtype='int64')
 
     _, maxid = fluid.layers.topk(input=prediction, k=1)
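
Review note on the recurring pad/unpad pattern above: both the encoder and the decoder changes follow the same recipe for moving off the LoD-based ops (dynamic_gru, DynamicRNN). fluid.layers.sequence_pad turns a variable-length LoD batch into a dense [batch, maxlen, dim] tensor plus a per-sequence length tensor, the dense tensor flows through fc(num_flatten_dims=2) and the cell-based RNN APIs, and fluid.layers.sequence_unpad drops the padding again afterwards. A minimal numpy sketch of that round trip (plain-numpy stand-ins for the Paddle ops, with invented shapes, not the model's real data):

import numpy as np

def sequence_pad(seqs, pad_value, maxlen=None):
    # Dense [batch, maxlen, dim] tensor plus the original lengths,
    # mirroring the two outputs of fluid.layers.sequence_pad.
    lengths = np.array([len(s) for s in seqs], dtype=np.int64)
    maxlen = maxlen or int(lengths.max())
    out = np.full((len(seqs), maxlen, seqs[0].shape[1]), pad_value, np.float32)
    for i, s in enumerate(seqs):
        out[i, :len(s)] = s
    return out, lengths

def sequence_unpad(x, lengths):
    # Strip the padding again, recovering the variable-length batch.
    return [x[i, :n] for i, n in enumerate(lengths)]

seqs = [np.ones((3, 2), np.float32), np.ones((5, 2), np.float32)]
padded, lens = sequence_pad(seqs, 0.0, maxlen=48)  # padded.shape == (2, 48, 2)
restored = sequence_unpad(padded, lens)            # lengths 3 and 5 again
assert all((a == b).all() for a, b in zip(seqs, restored))

The reshape(..., [-1]) calls in the diff fit this picture: the length tensor apparently comes back from sequence_pad with an extra dimension, and sequence_unpad wants it flat, which would explain why both output_len and target_embedding_length are flattened to 1-D before each unpad.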
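A second note on the decoder rewrite: StaticRNN unrolls over axis 0 of its step inputs, so the padded [batch, time, dim] embedding is transposed to time-major [time, batch, dim] before rnn.step_input, and the stacked step outputs are transposed back before the final sequence_unpad; that is what the two transpose(..., [1, 0, 2]) calls are for. Unlike DynamicRNN there is also no static_input: encoder_vec and encoder_proj are simply closed over inside the step block. A self-contained sketch of the time-major pattern (assuming Paddle Fluid ~1.6, the API generation this diff targets; the shapes, names, and the toy fc cell are invented for illustration):

import numpy as np
import paddle.fluid as fluid

batch, steps, dim, hidden = 4, 6, 8, 16

x = fluid.data(name='x', shape=[batch, steps, dim], dtype='float32')
boot = fluid.layers.fill_constant([batch, hidden], 'float32', 0.0)

x_tm = fluid.layers.transpose(x, [1, 0, 2])  # time-major: [steps, batch, dim]

rnn = fluid.layers.StaticRNN()
with rnn.step():
    word = rnn.step_input(x_tm)       # one [batch, dim] slice per time step
    prev = rnn.memory(init=boot)      # state carried between steps
    cur = fluid.layers.fc(input=[word, prev], size=hidden, act='tanh')
    rnn.update_memory(prev, cur)      # cur becomes prev at the next step
    rnn.step_output(cur)

out = fluid.layers.transpose(rnn(), [1, 0, 2])  # back to [batch, steps, hidden]

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
feed = {'x': np.random.rand(batch, steps, dim).astype('float32')}
print(exe.run(feed=feed, fetch_list=[out])[0].shape)  # (4, 6, 16)

One point possibly worth flagging in review: the encoder's sequence_pad hard-codes maxlen=48, so sliced features longer than 48 steps would need handling that the old dynamic_gru path did not require.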