提交 27afc286 编写于 作者: G guosheng

Update Additive Attention followed by GRU

上级 e4e393c8
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear, BatchNorm, Embedding, GRUUnit
from text import DynamicDecode, RNN, BasicLSTMCell, RNNCell from text import DynamicDecode, RNN, RNNCell
from model import Model, Loss from model import Model, Loss
...@@ -91,82 +92,70 @@ class OCRConv(fluid.dygraph.Layer): ...@@ -91,82 +92,70 @@ class OCRConv(fluid.dygraph.Layer):
return inputs_4 return inputs_4
class SimpleAttention(fluid.dygraph.Layer):
    """Additive (Bahdanau-style) attention over encoder time steps.

    The decoder state is projected, broadcast-added to the pre-projected
    encoder features, scored per time step with a scalar linear unit,
    and the softmax-normalized scores weight the raw encoder outputs.
    """

    def __init__(self, decoder_size):
        super(SimpleAttention, self).__init__()
        # Projects the decoder state into the attention space.
        self.fc1 = Linear(decoder_size, decoder_size, bias_attr=False)
        # Maps each mixed feature vector to one scalar energy.
        self.fc2 = Linear(decoder_size, 1, bias_attr=False)

    def forward(self, encoder_vec, encoder_proj, decoder_state):
        # Broadcast the projected state across the time dimension.
        state_proj = layers.unsqueeze(self.fc1(decoder_state), [1])
        energy = layers.tanh(layers.elementwise_add(encoder_proj, state_proj))
        # One score per time step, normalized over time.
        weights = layers.softmax(layers.squeeze(self.fc2(energy), [2]))
        # Weight encoder outputs and sum over time to get the context.
        weighted = layers.elementwise_mul(x=encoder_vec, y=weights, axis=0)
        return layers.reduce_sum(weighted, dim=1)
class GRUCell(RNNCell):
    """Single-step GRU cell: a 3-way input projection feeding ``GRUUnit``.

    NOTE(review): these source lines were a garbled side-by-side diff
    rendering (old and new text concatenated per line); the post-commit
    (right-hand) version is reconstructed here.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 origin_mode=False):
        super(GRUCell, self).__init__()
        self.hidden_size = hidden_size
        # Projects step inputs to the 3 * hidden gate pre-activations
        # expected by GRUUnit; the gate bias lives in GRUUnit itself.
        self.fc_layer = Linear(
            input_size,
            hidden_size * 3,
            param_attr=param_attr,
            bias_attr=False)
        self.gru_unit = GRUUnit(
            hidden_size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)

    def forward(self, inputs, states):
        # step_outputs, new_states = cell(step_inputs, states)
        # for GRUCell, `step_outputs` and `new_states` both are hidden
        x = self.fc_layer(inputs)
        hidden, _, _ = self.gru_unit(x, states)
        return hidden, hidden

    @property
    def state_shape(self):
        # Recurrent state shape, excluding the batch dimension.
        return [self.hidden_size]
class DecoderCell(RNNCell): def state_shape(self):
def __init__(self, size): return [self.hidden_size]
self.gru = GRUCell(size)
self.attention = SimpleAttention(size)
self.fc_1_layer = Linear(
input_dim=size * 2, output_dim=size * 3, bias_attr=False)
self.fc_2_layer = Linear(
input_dim=size, output_dim=size * 3, bias_attr=False)
def forward(self, inputs, states, encoder_vec, encoder_proj):
context = self.attention(encoder_vec, encoder_proj, states)
fc_1 = self.fc_1_layer(context)
fc_2 = self.fc_2_layer(inputs)
decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2)
h, _ = self.gru(decoder_inputs, states)
return h, h
class Decoder(fluid.dygraph.Layer):
    """Attention decoder: embeds target tokens, runs the attentive GRU
    over the sequence, and projects hidden states to class logits."""

    def __init__(self, size, num_classes):
        super(Decoder, self).__init__()
        self.embedder = Embedding(size=[num_classes, size])
        self.gru_attention = RNN(
            DecoderCell(size), is_reverse=False, time_major=False)
        self.output_layer = Linear(size, num_classes, bias_attr=False)

    def forward(self, target, decoder_initial_states, encoder_vec,
                encoder_proj):
        embedded = self.embedder(target)
        # Encoder tensors are forwarded to the cell at every step.
        hidden_seq, _ = self.gru_attention(
            embedded,
            initial_states=decoder_initial_states,
            encoder_vec=encoder_vec,
            encoder_proj=encoder_proj)
        return self.output_layer(hidden_seq)
class EncoderNet(fluid.dygraph.Layer): class EncoderNet(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
batch_size,
decoder_size, decoder_size,
rnn_hidden_size=200, rnn_hidden_size=200,
is_test=False, is_test=False,
...@@ -179,21 +168,24 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -179,21 +168,24 @@ class EncoderNet(fluid.dygraph.Layer):
initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0) initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn)
self.fc_1_layer = Linear( self.gru_forward_layer = RNN(
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) cell=GRUCell(
self.fc_2_layer = Linear( input_size=128 * 6, # channel * h
768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) hidden_size=rnn_hidden_size,
self.gru_forward_layer = DynamicGRU(
size=rnn_hidden_size,
param_attr=para_attr, param_attr=para_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
candidate_activation='relu') candidate_activation='relu'),
self.gru_backward_layer = DynamicGRU( is_reverse=False,
size=rnn_hidden_size, time_major=False)
self.gru_backward_layer = RNN(
cell=GRUCell(
input_size=128 * 6, # channel * h
hidden_size=rnn_hidden_size,
param_attr=para_attr, param_attr=para_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
candidate_activation='relu', candidate_activation='relu'),
is_reverse=True) is_reverse=True,
time_major=False)
self.encoded_proj_fc = Linear( self.encoded_proj_fc = Linear(
rnn_hidden_size * 2, decoder_size, bias_attr=False) rnn_hidden_size * 2, decoder_size, bias_attr=False)
...@@ -211,13 +203,9 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -211,13 +203,9 @@ class EncoderNet(fluid.dygraph.Layer):
], ],
inplace=False) inplace=False)
fc_1 = self.fc_1_layer(sliced_feature) gru_forward, _ = self.gru_forward_layer(sliced_feature)
fc_2 = self.fc_2_layer(sliced_feature)
gru_forward = self.gru_forward_layer(fc_1) gru_backward, _ = self.gru_backward_layer(sliced_feature)
gru_backward = self.gru_backward_layer(fc_2)
encoded_vector = fluid.layers.concat( encoded_vector = fluid.layers.concat(
input=[gru_forward, gru_backward], axis=2) input=[gru_forward, gru_backward], axis=2)
...@@ -227,88 +215,50 @@ class EncoderNet(fluid.dygraph.Layer): ...@@ -227,88 +215,50 @@ class EncoderNet(fluid.dygraph.Layer):
return gru_backward, encoded_vector, encoded_proj return gru_backward, encoded_vector, encoded_proj
class DecoderCell(RNNCell):
    """One decoding step: attend over encoder outputs, then update a GRU.

    NOTE(review): these source lines were a garbled side-by-side diff
    rendering; the post-commit (right-hand) version is reconstructed here.
    """

    def __init__(self, encoder_size, decoder_size):
        super(DecoderCell, self).__init__()
        self.attention = SimpleAttention(decoder_size)
        self.gru_cell = GRUCell(
            # encoded_vector.shape[-1] + embed_size
            input_size=encoder_size * 2 + decoder_size,
            hidden_size=decoder_size)

    def forward(self, current_word, states, encoder_vec, encoder_proj):
        # Attention context is conditioned on the previous hidden state.
        context = self.attention(encoder_vec, encoder_proj, states)
        decoder_inputs = layers.concat([current_word, context], axis=1)
        hidden, _ = self.gru_cell(decoder_inputs, states)
        return hidden, hidden
class GRUDecoderWithAttention(fluid.dygraph.Layer):
    """GRU decoder with additive attention, unrolled by the RNN wrapper.

    NOTE(review): these source lines were a garbled side-by-side diff
    rendering; the post-commit (right-hand) version is reconstructed here.
    """

    def __init__(self, encoder_size, decoder_size, num_classes):
        super(GRUDecoderWithAttention, self).__init__()
        self.gru_attention = RNN(DecoderCell(encoder_size, decoder_size),
                                 is_reverse=False,
                                 time_major=False)
        # +2 reserves ids for the start/end tokens.
        self.out_layer = Linear(
            input_dim=decoder_size,
            output_dim=num_classes + 2,
            bias_attr=None,
            act='softmax')

    def forward(self, inputs, decoder_initial_states, encoder_vec,
                encoder_proj):
        out, _ = self.gru_attention(
            inputs,
            initial_states=decoder_initial_states,
            encoder_vec=encoder_vec,
            encoder_proj=encoder_proj)
        predict = self.out_layer(out)
        return predict
class OCRAttention(fluid.dygraph.Layer): class OCRAttention(Model):
def __init__(self, batch_size, num_classes, encoder_size, decoder_size, def __init__(self, num_classes, encoder_size, decoder_size,
word_vector_dim): word_vector_dim):
super(OCRAttention, self).__init__() super(OCRAttention, self).__init__()
self.encoder_net = EncoderNet(batch_size, decoder_size) self.encoder_net = EncoderNet(decoder_size)
self.fc = Linear( self.fc = Linear(
input_dim=encoder_size, input_dim=encoder_size,
output_dim=decoder_size, output_dim=decoder_size,
...@@ -318,36 +268,26 @@ class OCRAttention(fluid.dygraph.Layer): ...@@ -318,36 +268,26 @@ class OCRAttention(fluid.dygraph.Layer):
[num_classes + 2, word_vector_dim], dtype='float32') [num_classes + 2, word_vector_dim], dtype='float32')
self.gru_decoder_with_attention = GRUDecoderWithAttention( self.gru_decoder_with_attention = GRUDecoderWithAttention(
encoder_size, decoder_size, num_classes) encoder_size, decoder_size, num_classes)
self.batch_size = batch_size
def forward(self, inputs, label_in): def forward(self, inputs, label_in):
gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs) gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
backward_first = fluid.layers.slice(
gru_backward, axes=[1], starts=[0], ends=[1])
backward_first = fluid.layers.reshape(
backward_first, [-1, backward_first.shape[2]], inplace=False)
decoder_boot = self.fc(backward_first)
label_in = fluid.layers.reshape(label_in, [-1], inplace=False) decoder_boot = self.fc(gru_backward[:, 0])
trg_embedding = self.embedding(label_in) trg_embedding = self.embedding(label_in)
prediction = self.gru_decoder_with_attention(
trg_embedding, decoder_boot, encoded_vector, encoded_proj)
trg_embedding = fluid.layers.reshape( return prediction
trg_embedding, [self.batch_size, -1, trg_embedding.shape[1]],
inplace=False)
pred_temp = []
for i in range(trg_embedding.shape[1]):
current_word = fluid.layers.slice(
trg_embedding, axes=[1], starts=[i], ends=[i + 1])
out, decoder_boot = self.gru_decoder_with_attention(
current_word, encoded_vector, encoded_proj, decoder_boot)
pred_temp.append(out)
pred_temp = fluid.layers.concat(pred_temp, axis=1)
class CrossEntropyCriterion(Loss):
    """Masked sequence cross-entropy: per-token losses are zeroed where
    the mask is 0, then summed.

    NOTE(review): these source lines were a garbled side-by-side diff
    rendering; the post-commit (right-hand) version is reconstructed here.
    """

    def __init__(self):
        super(CrossEntropyCriterion, self).__init__()

    def forward(self, outputs, labels):
        predict, (label, mask) = outputs[0], labels
        loss = layers.cross_entropy(predict, label=label, soft_label=False)
        # Zero-out padding positions before reducing.
        loss = layers.elementwise_mul(loss, mask, axis=0)
        loss = layers.reduce_sum(loss)
        return loss
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import paddle.fluid.profiler as profiler
import paddle.fluid as fluid
import data_reader
from paddle.fluid.dygraph.base import to_variable
import argparse
import functools
from utility import add_arguments, print_arguments, get_attention_feeder_data
from model import Input, set_device
from nets import OCRAttention, CrossEntropyCriterion
from eval import evaluate
# Command-line interface: every option is registered through `add_arg`,
# a thin wrapper over argparse supplied by utility.add_arguments.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# Training schedule and logging.
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('epoch_num', int, 30, "Epoch number.")
add_arg('lr', float, 0.001, "Learning rate.")
add_arg('lr_decay_strategy', str, "", "Learning rate decay strategy.")
add_arg('log_period', int, 200, "Log period.")
add_arg('save_model_period', int, 2000, "Save model period. '-1' means never saving the model.")
add_arg('eval_period', int, 2000, "Evaluate period. '-1' means never evaluating the model.")
# Data and checkpoint locations.
add_arg('save_model_dir', str, "./output", "The directory the model to be saved to.")
add_arg('train_images', str, None, "The directory of images to be used for training.")
add_arg('train_list', str, None, "The list file of images to be used for training.")
add_arg('test_images', str, None, "The directory of images to be used for test.")
# NOTE(review): help text says "training" but this is the test list.
add_arg('test_list', str, None, "The list file of images to be used for training.")
add_arg('init_model', str, None, "The init model file of directory.")
# Execution environment.
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('parallel', bool, False, "Whether use parallel training.")
add_arg('profile', bool, False, "Whether to use profiling.")
add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.")
add_arg('skip_test', bool, False, "Whether to skip test phase.")
# model hyper parameters
add_arg('encoder_size', int, 200, "Encoder size.")
add_arg('decoder_size', int, 128, "Decoder size.")
add_arg('word_vector_dim', int, 128, "Word vector dim.")
add_arg('num_classes', int, 95, "Number classes.")
add_arg('gradient_clip', float, 5.0, "Gradient clip value.")
add_arg('dynamic', bool, False, "Whether to use dygraph.")
def train(args):
    """Train the OCR attention model with the settings in ``args``."""
    device = set_device("gpu" if args.use_gpu else "cpu")
    # Dygraph (imperative) mode is opt-in via --dynamic; the original
    # used a conditional expression purely for its side effect.
    if args.dynamic:
        fluid.enable_dygraph(device)

    ocr_attention = OCRAttention(
        encoder_size=args.encoder_size,
        decoder_size=args.decoder_size,
        num_classes=args.num_classes,
        word_vector_dim=args.word_vector_dim)

    # Optional piecewise learning-rate decay at 200k / 250k steps.
    LR = args.lr
    if args.lr_decay_strategy == "piecewise_decay":
        learning_rate = fluid.layers.piecewise_decay(
            [200000, 250000], [LR, LR * 0.1, LR * 0.01])
    else:
        learning_rate = LR
    optimizer = fluid.optimizer.Adam(
        learning_rate=learning_rate,
        parameter_list=ocr_attention.parameters())
    # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(args.gradient_clip)

    # Static input/label descriptors consumed by Model.prepare.
    inputs = [
        Input([None, 1, 48, 384], "float32", name="pixel"),
        Input([None, None], "int64", name="label_in"),
    ]
    labels = [
        Input([None, None], "int64", name="label_out"),
        Input([None, None], "float32", name="mask"),
    ]
    ocr_attention.prepare(
        optimizer, CrossEntropyCriterion(), inputs=inputs, labels=labels)

    train_reader = data_reader.data_reader(
        args.batch_size,
        shuffle=True,
        images_dir=args.train_images,
        list_file=args.train_list,
        data_type='train')

    total_step = 0
    for epoch in range(args.epoch_num):
        batch_id = 0
        total_loss = 0.0
        for data in train_reader():
            total_step += 1
            data_dict = get_attention_feeder_data(data)
            pixel = data_dict["pixel"]
            # Flatten label tensors to (batch, seq_len).
            label_in = data_dict["label_in"].reshape([pixel.shape[0], -1])
            label_out = data_dict["label_out"].reshape([pixel.shape[0], -1])
            mask = data_dict["mask"].reshape(label_out.shape).astype("float32")

            avg_loss = ocr_attention.train(
                inputs=[pixel, label_in], labels=[label_out, mask])[0]
            total_loss += avg_loss

            # Log the running average once per log period. The original
            # left an `if True:` debug override here, which both printed
            # every step and reset the accumulator prematurely, so the
            # reported "loss / log_period" average was wrong.
            if batch_id > 0 and batch_id % args.log_period == 0:
                print("epoch: {}, batch_id: {}, loss {}".format(
                    epoch, batch_id,
                    total_loss / args.batch_size / args.log_period))
                total_loss = 0.0
            batch_id += 1
if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    # Optionally wrap training in a profiler; the profiler backend
    # (CUDA vs. CPU) follows the --use_gpu flag.
    if args.profile:
        if args.use_gpu:
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                train(args)
        else:
            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
                train(args)
    else:
        train(args)
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册