From 89f00e53630af9711e6606f4effb935211790599 Mon Sep 17 00:00:00 2001
From: Hongyu Liu <43953930+phlrain@users.noreply.github.com>
Date: Thu, 30 May 2019 11:24:34 +0800
Subject: [PATCH] Add dygraph ocr model (#2333)

* add pbt lm; test=develop

* add dynamic ocr recognition; test=develop
---
 dygraph/ocr_recognition/data_reader.py | 273 +++++++++++
 dygraph/ocr_recognition/debug.sh       |   4 +
 dygraph/ocr_recognition/train.py       | 607 +++++++++++++++++++++++++
 3 files changed, 884 insertions(+)
 create mode 100644 dygraph/ocr_recognition/data_reader.py
 create mode 100644 dygraph/ocr_recognition/debug.sh
 create mode 100644 dygraph/ocr_recognition/train.py

diff --git a/dygraph/ocr_recognition/data_reader.py b/dygraph/ocr_recognition/data_reader.py
new file mode 100644
index 00000000..00e98d12
--- /dev/null
+++ b/dygraph/ocr_recognition/data_reader.py
@@ -0,0 +1,273 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tarfile
+import numpy as np
+from PIL import Image
+from os import path
+import paddle
+
+SOS = 0
+EOS = 1
+NUM_CLASSES = 95
+DATA_SHAPE = [1, 48, 512]
+
+DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
+DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class DataGenerator(object):
+    def __init__(self, model="crnn_ctc"):
+        self.model = model
+
+    def train_reader(self,
+                     img_root_dir,
+                     img_label_list,
+                     batchsize,
+                     cycle,
+                     max_length,
+                     shuffle=True):
+        '''
+        Reader interface for training.
+
+        :param img_root_dir: The root directory of the training images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for training.
+        :type img_label_list: str
+
+        :param batchsize: The number of samples in one batch.
+        :type batchsize: int
+
+        :param cycle: If True, the reader cycles through the dataset
+            indefinitely, so more than dataset_size / batchsize iterations
+            can be drawn from it.
+        :type cycle: bool
+
+        :param max_length: Currently unused; each batch is padded to the
+            length of its longest label instead.
+        :type max_length: int
+
+        :param shuffle: Whether to shuffle the list file (a batch-level
+            shuffle when batchsize > 1).
+        :type shuffle: bool
+        '''
+
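+        # The list file is reordered by a shell pipeline before reading.
+        # With shuffle enabled and batchsize > 1: lines are sorted by image
+        # height with a random tie-break, so each batch groups images of
+        # similar height; a random number of leading lines is dropped so
+        # batch boundaries differ from run to run; the resulting batches
+        # are shuffled and then split back into one line per sample.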
+        img_label_lines = []
+        to_file = "tmp.txt"
+        if not shuffle:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' > " + to_file
+        elif batchsize == 1:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+        else:
+            #cmd1: partial shuffle
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            #cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + str(
+                batchsize) + " == 0) print \"\";}' | shuf | "
+            #cmd3: batch split
+            cmd += "awk '{if(NF == " + str(
+                batchsize
+            ) + " * 4) {for(i = 0; i < " + str(
+                batchsize
+            ) + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+        os.system(cmd)
+        print("finish batch shuffle")
+        img_label_lines = open(to_file, 'r').readlines()
+
+        def reader():
+            sizes = len(img_label_lines) // batchsize
+            if sizes == 0:
+                raise ValueError('batchsize is bigger than the dataset size.')
+            while True:
+                for i in range(sizes):
+                    result = []
+                    sz = [0, 0]
+                    # First pass: find the longest label in this batch.
+                    max_len = 0
+                    for k in range(batchsize):
+                        line = img_label_lines[i * batchsize + k]
+                        items = line.split(' ')
+                        label = [int(c) for c in items[-1].split(',')]
+                        max_len = max(max_len, len(label))
+
+                    max_length = max_len
+
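+                    # Second pass: build one sample per line. Every label is
+                    # padded with EOS (or truncated; the longest label loses
+                    # its last character) so that label_in = [SOS] + label
+                    # and label_out = label + [EOS] both have max_length
+                    # entries, and a float mask marks the valid positions so
+                    # padding can be zeroed out of the loss later.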
+                    for j in range(batchsize):
+                        line = img_label_lines[i * batchsize + j]
+                        items = line.split(' ')
+                        label = [int(c) for c in items[-1].split(',')]
+
+                        mask = np.zeros((max_len)).astype('float32')
+                        mask[:len(label) + 1] = 1.0
+                        if max_length > len(label) + 1:
+                            extend_label = [EOS] * (max_length - len(label) - 1)
+                            label.extend(extend_label)
+                        else:
+                            label = label[0:max_length - 1]
+                        img = Image.open(os.path.join(img_root_dir, items[
+                            2])).convert('L')
+                        # Resize every image to the size of the first image
+                        # in the batch so the batch can be stacked.
+                        if j == 0:
+                            sz = img.size
+                        img = img.resize((sz[0], sz[1]))
+                        img = np.array(img) - 127.5
+                        img = img[np.newaxis, ...]
+                        result.append([img, [SOS] + label, label + [EOS], mask])
+                    yield result
+                if not cycle:
+                    break
+
+        return reader
+
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for evaluation.
+
+        :param img_root_dir: The root directory of the test images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for testing.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            for line in open(img_label_list):
+                # h, w, img_name, labels
+                items = line.split(' ')
+
+                label = [int(c) for c in items[-1].split(',')]
+                img = Image.open(os.path.join(img_root_dir, items[2])).convert(
+                    'L')
+                img = np.array(img) - 127.5
+                img = img[np.newaxis, ...]
+                if self.model == "crnn_ctc":
+                    yield img, label
+                else:
+                    yield img, [SOS] + label, label + [EOS]
+
+        return reader
+
+    def infer_reader(self, img_root_dir=None, img_label_list=None, cycle=False):
+        '''A reader interface for inference.
+
+        :param img_root_dir: The root directory of the images for inference.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for inference.
+            If it is None, image paths are read from stdin instead.
+        :type img_label_list: str
+
+        :param cycle: If True, the reader cycles through the dataset
+            indefinitely.
+        :type cycle: bool
+        '''
+
+        def reader():
+            def yield_img_and_label(lines):
+                for line in lines:
+                    if img_root_dir is not None:
+                        # h, w, img_name, labels
+                        img_name = line.split(' ')[2]
+                        img_path = os.path.join(img_root_dir, img_name)
+                    else:
+                        img_path = line.strip("\t\n\r")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    label = [int(c) for c in line.split(' ')[3].split(',')]
+                    yield img, label
+
+            if img_label_list is not None:
+                lines = []
+                with open(img_label_list) as f:
+                    lines = f.readlines()
+                for img, label in yield_img_and_label(lines):
+                    yield img, label
+                while cycle:
+                    for img, label in yield_img_and_label(lines):
+                        yield img, label
+            else:
+                while True:
+                    img_path = input("Please input the path of image: ")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, [[0]]
+
+        return reader
+
+
+def num_classes():
+    '''Get the number of classes of this dataset.
+    '''
+    return NUM_CLASSES
+
+
+def data_shape():
+    '''Get the image shape of this dataset. It is a dummy shape for this
+    dataset.
+    '''
+    return DATA_SHAPE
+
+
+def train(batch_size,
+          max_length,
+          train_images_dir=None,
+          train_list_file=None,
+          cycle=False,
+          shuffle=False,
+          model="crnn_ctc"):
+    generator = DataGenerator(model)
+    if train_images_dir is None or train_list_file is None:
+        data_dir = download_data()
+    if train_images_dir is None:
+        train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
+    if train_list_file is None:
+        train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME)
+    return generator.train_reader(
+        train_images_dir,
+        train_list_file,
+        batch_size,
+        cycle,
+        max_length,
+        shuffle=shuffle)
+
+
+def test(batch_size=1,
+         test_images_dir=None,
+         test_list_file=None,
+         model="crnn_ctc"):
+    generator = DataGenerator(model)
+    if test_images_dir is None or test_list_file is None:
+        data_dir = download_data()
+    if test_images_dir is None:
+        test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
+    if test_list_file is None:
+        test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME)
+    return paddle.batch(
+        generator.test_reader(test_images_dir, test_list_file), batch_size)
+
+
+def inference(batch_size=1,
+              infer_images_dir=None,
+              infer_list_file=None,
+              cycle=False,
+              model="crnn_ctc"):
+    generator = DataGenerator(model)
+    return paddle.batch(
+        generator.infer_reader(infer_images_dir, infer_list_file, cycle),
+        batch_size)
+
+
+def download_data():
+    '''Download the train and test data.
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/dygraph/ocr_recognition/debug.sh b/dygraph/ocr_recognition/debug.sh
new file mode 100644
index 00000000..076a52aa
--- /dev/null
+++ b/dygraph/ocr_recognition/debug.sh
@@ -0,0 +1,4 @@
+
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py
diff --git a/dygraph/ocr_recognition/train.py b/dygraph/ocr_recognition/train.py
new file mode 100644
index 00000000..954612af
--- /dev/null
+++ b/dygraph/ocr_recognition/train.py
@@ -0,0 +1,607 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import data_reader
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
+from paddle.fluid.dygraph.base import to_variable
+import argparse
+import functools
+from utility import add_arguments, print_arguments, get_attention_feeder_data
+
+from paddle.fluid import framework
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('total_step', int, 720000, "The number of iterations. Zero or less means one pass over the whole training set; more than zero means the training set may be looped until that many iterations have been run.")
+add_arg('log_period', int, 1000, "Log period.")
+add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
+add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
+add_arg('save_model_dir', str, "./models", "The directory the model is saved to.")
+add_arg('train_images', str, None, "The directory of images to be used for training.")
+add_arg('train_list', str, None, "The list file of images to be used for training.")
+add_arg('test_images', str, None, "The directory of images to be used for test.")
+add_arg('test_list', str, None, "The list file of images to be used for testing.")
+add_arg('model', str, "attention", "Which type of network to be used. 'crnn_ctc' or 'attention'")
+add_arg('init_model', str, None, "The initial model file or directory.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for training.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 12500, "Max average window. It is proposed to be set as the number of minibatches in a pass.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
+add_arg('profile', bool, False, "Whether to use profiling.")
+add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for a better performance test.")
+add_arg('skip_test', bool, False, "Whether to skip the test phase.")
+
+
+class Config(object):
+    '''
+    Configuration for training.
+    '''
+    # decoder size for the decoder stage
+    decoder_size = 128
+    # size of the word embedding
+    word_vector_dim = 128
+    # max length for label padding
+    max_length = 100
+    gradient_clip = 10
+    LR = 1.0
+    beam_size = 2
+    learning_rate_decay = None
+
+    # batch size for training
+    batch_size = 32
+    # number of classes to classify
+    num_classes = 95
+
+    use_gpu = False
+    # special labels for start and end
+    SOS = 0
+    EOS = 1
+    # settings for the ctc data, not used in unittest
+    DATA_DIR_NAME = "./dataset/ctc_data/data"
+    TRAIN_DATA_DIR_NAME = "train_images"
+    TRAIN_LIST_FILE_NAME = "train.list"
+
+    # data shape of the input image
+    DATA_SHAPE = [1, 48, 512]
+
+
+class ConvBNPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 group,
+                 out_ch,
+                 channels,
+                 act="relu",
+                 is_test=False,
+                 pool=True,
+                 use_cudnn=True):
+        super(ConvBNPool, self).__init__(name_scope)
+        self.group = group
+        self.pool = pool
+
+        filter_size = 3
+        conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5
+        conv_param_0 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_0))
+
+        conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5
+        conv_param_1 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_1))
+
+        self.conv_0_layer = Conv2D(
+            self.full_name(),
+            channels[0],
+            out_ch[0],
+            3,
+            padding=1,
+            param_attr=conv_param_0,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_0_layer = BatchNorm(
+            self.full_name(), out_ch[0], act=act, is_test=is_test)
+        self.conv_1_layer = Conv2D(
+            self.full_name(),
+            num_channels=channels[1],
+            num_filters=out_ch[1],
+            filter_size=3,
+            padding=1,
+            param_attr=conv_param_1,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_1_layer = BatchNorm(
+            self.full_name(), out_ch[1], act=act, is_test=is_test)
+
+        if self.pool:
+            self.pool_layer = Pool2D(
+                self.full_name(),
+                pool_size=2,
+                pool_type='max',
+                pool_stride=2,
+                use_cudnn=use_cudnn,
+                ceil_mode=True)
+
+    def forward(self, inputs):
+        conv_0 = self.conv_0_layer(inputs)
+        bn_0 = self.bn_0_layer(conv_0)
+        conv_1 = self.conv_1_layer(bn_0)
+        bn_1 = self.bn_1_layer(conv_1)
+        if self.pool:
+            return self.pool_layer(bn_1)
+        return bn_1
+
+
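+# OCRConv stacks four conv-BN blocks; the first three end with 2x2 max
+# pooling, so a 1x48x512 input is reduced to a 128-channel feature map at
+# 1/8 of the spatial resolution. Its width later serves as the time axis
+# for the recurrent encoder.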
+class OCRConv(fluid.dygraph.Layer):
+    def __init__(self, name_scope, is_test=False, use_cudnn=True):
+        super(OCRConv, self).__init__(name_scope)
+        self.conv_bn_pool_1 = ConvBNPool(
+            self.full_name(),
+            2, [16, 16], [1, 16],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_2 = ConvBNPool(
+            self.full_name(),
+            2, [32, 32], [16, 32],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_3 = ConvBNPool(
+            self.full_name(),
+            2, [64, 64], [32, 64],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_4 = ConvBNPool(
+            self.full_name(),
+            2, [128, 128], [64, 128],
+            is_test=is_test,
+            pool=False,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        inputs_1 = self.conv_bn_pool_1(inputs)
+        inputs_2 = self.conv_bn_pool_2(inputs_1)
+        inputs_3 = self.conv_bn_pool_3(inputs_2)
+        inputs_4 = self.conv_bn_pool_4(inputs_3)
+
+        return inputs_4
+
+
+class DynamicGRU(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 h_0=None,
+                 origin_mode=False,
+                 init_size=None):
+        super(DynamicGRU, self).__init__(scope_name)
+
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+        self.size = size
+        self.h_0 = h_0
+        self.is_reverse = is_reverse
+
+    def forward(self, inputs):
+        # Unroll the GRU over the time axis step by step; dygraph has no
+        # dynamic-length GRU layer, so GRUUnit is applied once per step.
+        hidden = self.h_0
+        res = []
+        for i in range(inputs.shape[1]):
+            if self.is_reverse:
+                i = inputs.shape[1] - 1 - i
+            input_ = inputs[:, i:i + 1, :]
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
+            hidden, reset, gate = self.gru_unit(input_, hidden)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            res.append(hidden_)
+        if self.is_reverse:
+            res = res[::-1]
+        res = fluid.layers.concat(res, axis=1)
+        return res
+
+
+class EncoderNet(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 rnn_hidden_size=200,
+                 is_test=False,
+                 use_cudnn=True):
+        super(EncoderNet, self).__init__(scope_name)
+        self.rnn_hidden_size = rnn_hidden_size
+        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
+                                                                         0.02))
+        bias_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
+        if fluid.framework.in_dygraph_mode():
+            h_0 = np.zeros(
+                (Config.batch_size, rnn_hidden_size), dtype="float32")
+            h_0 = to_variable(h_0)
+        else:
+            h_0 = fluid.layers.fill_constant(
+                shape=[Config.batch_size, rnn_hidden_size],
+                dtype='float32',
+                value=0)
+        self.ocr_convs = OCRConv(
+            self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.fc_2_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.gru_forward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu')
+        self.gru_backward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu',
+            is_reverse=True)
+
+        self.encoded_proj_fc = FC(self.full_name(),
+                                  Config.decoder_size,
+                                  bias_attr=False,
+                                  num_flatten_dims=2)
+
+    def forward(self, inputs):
+        conv_features = self.ocr_convs(inputs)
+        # NCHW -> NWCH, then flatten each column of the feature map so that
+        # the image width becomes the sequence dimension.
+        transpose_conv_features = fluid.layers.transpose(
+            conv_features, perm=[0, 3, 1, 2])
+        sliced_feature = fluid.layers.reshape(
+            transpose_conv_features, [
+                -1, transpose_conv_features.shape[1],
+                transpose_conv_features.shape[2] *
+                transpose_conv_features.shape[3]
+            ],
+            inplace=False)
+
+        fc_1 = self.fc_1_layer(sliced_feature)
+        fc_2 = self.fc_2_layer(sliced_feature)
+
+        gru_forward = self.gru_forward_layer(fc_1)
+        gru_backward = self.gru_backward_layer(fc_2)
+
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward, gru_backward], axis=2)
+        encoded_proj = self.encoded_proj_fc(encoded_vector)
+
+        return gru_backward, encoded_vector, encoded_proj
+
+
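+# SimpleAttention implements additive (Bahdanau-style) attention: the decoder
+# state is projected, broadcast over the encoder time steps and added to the
+# pre-computed encoder projection; tanh plus a second FC yield one score per
+# step, softmax turns the scores into weights, and the context vector is the
+# weighted sum of the encoder outputs.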
+class SimpleAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size):
+        super(SimpleAttention, self).__init__(scope_name)
+
+        self.fc_1 = FC(self.full_name(),
+                       decoder_size,
+                       act=None,
+                       bias_attr=False)
+        self.fc_2 = FC(self.full_name(),
+                       1,
+                       num_flatten_dims=2,
+                       act=None,
+                       bias_attr=False)
+
+    def _build_once(self, encoder_vec, encoder_proj, decoder_state):
+        pass
+
+    def forward(self, encoder_vec, encoder_proj, decoder_state):
+        decoder_state_fc = self.fc_1(decoder_state)
+        decoder_state_proj_reshape = fluid.layers.reshape(
+            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]],
+            inplace=False)
+        decoder_state_expand = fluid.layers.expand(
+            decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1])
+        concated = fluid.layers.elementwise_add(encoder_proj,
+                                                decoder_state_expand)
+        concated = fluid.layers.tanh(x=concated)
+        attention_weight = self.fc_2(concated)
+        weights_reshape = fluid.layers.reshape(
+            x=attention_weight, shape=[concated.shape[0], -1], inplace=False)
+        weights_reshape = fluid.layers.softmax(weights_reshape)
+        scaled = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=weights_reshape, axis=0)
+        context = fluid.layers.reduce_sum(scaled, dim=1)
+
+        return context
+
+
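+# The decoder is teacher-forced during training: at every target position it
+# attends over the encoder states, mixes the context vector with the
+# embedding of the previous ground-truth character, advances one GRU step and
+# emits a softmax over num_classes + 2 labels (characters plus SOS/EOS).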
+class GRUDecoderWithAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size, num_classes):
+        super(GRUDecoderWithAttention, self).__init__(scope_name)
+        self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.fc_2_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size=decoder_size * 3,
+            param_attr=None,
+            bias_attr=None)
+        self.out_layer = FC(self.full_name(),
+                            size=num_classes + 2,
+                            bias_attr=None,
+                            act='softmax')
+
+        self.decoder_size = decoder_size
+
+    def _build_once(self, target_embedding, encoder_vec, encoder_proj,
+                    decoder_boot):
+        pass
+
+    def forward(self, target_embedding, encoder_vec, encoder_proj,
+                decoder_boot):
+        res = []
+        hidden_mem = decoder_boot
+        for i in range(target_embedding.shape[1]):
+            current_word = fluid.layers.slice(
+                target_embedding, axes=[1], starts=[i], ends=[i + 1])
+            current_word = fluid.layers.reshape(
+                current_word, [-1, current_word.shape[2]], inplace=False)
+
+            context = self.simple_attention(encoder_vec, encoder_proj,
+                                            hidden_mem)
+            fc_1 = self.fc_1_layer(context)
+            fc_2 = self.fc_2_layer(current_word)
+            decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2)
+
+            h, _, _ = self.gru_unit(decoder_inputs, hidden_mem)
+            hidden_mem = h
+            out = self.out_layer(h)
+            res.append(out)
+
+        res1 = fluid.layers.concat(res, axis=1)
+        batch_size = target_embedding.shape[0]
+        seq_len = target_embedding.shape[1]
+        res1 = layers.reshape(res1, shape=[batch_size, seq_len, -1])
+
+        return res1
+
+
+class OCRAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name):
+        super(OCRAttention, self).__init__(scope_name)
+        self.encoder_net = EncoderNet(self.full_name())
+        self.fc = FC(self.full_name(),
+                     size=Config.decoder_size,
+                     bias_attr=False,
+                     act='relu')
+        self.embedding = Embedding(
+            self.full_name(), [Config.num_classes + 2, Config.word_vector_dim],
+            dtype='float32')
+        self.gru_decoder_with_attention = GRUDecoderWithAttention(
+            self.full_name(), Config.decoder_size, Config.num_classes)
+
+    def _build_once(self, inputs, label_in):
+        pass
+
+    def forward(self, inputs, label_in):
+        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
+        # The first step of the backward GRU summarizes the whole sequence;
+        # it is projected to bootstrap the decoder state.
+        backward_first = fluid.layers.slice(
+            gru_backward, axes=[1], starts=[0], ends=[1])
+        backward_first = fluid.layers.reshape(
+            backward_first, [-1, backward_first.shape[2]], inplace=False)
+
+        decoder_boot = self.fc(backward_first)
+
+        label_in = fluid.layers.reshape(label_in, [-1, 1], inplace=False)
+        trg_embedding = self.embedding(label_in)
+
+        trg_embedding = fluid.layers.reshape(
+            trg_embedding, [Config.batch_size, -1, trg_embedding.shape[1]],
+            inplace=False)
+
+        prediction = self.gru_decoder_with_attention(
+            trg_embedding, encoded_vector, encoded_proj, decoder_boot)
+
+        return prediction
+
+
+def train(args):
+
+    with fluid.dygraph.guard():
+        backward_strategy = fluid.dygraph.BackwardStrategy()
+        backward_strategy.sort_sum_gradient = True
+        ocr_attention = OCRAttention("ocr_attention")
+
+        if Config.learning_rate_decay == "piecewise_decay":
+            learning_rate = fluid.layers.piecewise_decay(
+                [50000], [Config.LR, Config.LR * 0.01])
+        else:
+            learning_rate = Config.LR
+        # NOTE: the schedule computed above is currently not passed to the
+        # optimizer; training runs with a fixed learning rate of 0.001.
+        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+
+        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
+
+        train_reader = data_reader.train(
+            Config.batch_size,
+            max_length=Config.max_length,
+            train_images_dir=args.train_images,
+            train_list_file=args.train_list,
+            cycle=args.total_step > 0,
+            shuffle=True,
+            model=args.model)
+
+        # Evaluation reuses the batched train reader over the test split
+        # (no shuffling, no cycling).
+        infer_image = './data/data/test_images/'
+        infer_files = './data/data/test.list'
+        test_reader = data_reader.train(
+            Config.batch_size,
+            1000,
+            train_images_dir=infer_image,
+            train_list_file=infer_files,
+            cycle=False,
+            model=args.model)
+
+        def eval():
+            ocr_attention.eval()
+            total_step = 0.0
+            equal_size = 0
+            for data in test_reader():
+                data_dict = get_attention_feeder_data(data)
+
+                label_in = to_variable(data_dict["label_in"])
+                label_out = to_variable(data_dict["label_out"])
+
+                label_out._stop_gradient = True
+                label_out.trainable = False
+
+                img = to_variable(data_dict["pixel"])
+
+                prediction = ocr_attention(img, label_in)
+                prediction = fluid.layers.reshape(
+                    prediction, [label_out.shape[0] * label_out.shape[1], -1],
+                    inplace=False)
+
+                score, topk = layers.topk(prediction, 1)
+
+                seq = topk.numpy()
+                seq = seq.reshape((args.batch_size, -1))
+
+                mask = data_dict['mask'].reshape((args.batch_size, -1))
+                seq_len = np.sum(mask, -1)
+
+                trans_ref = data_dict["label_out"].reshape(
+                    (args.batch_size, -1))
+                # A sample counts as correct only if the whole predicted
+                # sequence matches the reference.
+                for i in range(args.batch_size):
+                    length = int(seq_len[i] - 1)
+                    trans = seq[i][:length - 1]
+                    ref = trans_ref[i][:length - 1]
+                    if np.array_equal(trans, ref):
+                        equal_size += 1
+
+                total_step += args.batch_size
+            print("eval accuracy", equal_size / total_step)
+
+        total_step = 0
+        epoch_num = 20
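+        # One training step: forward with teacher forcing, flatten the
+        # per-step softmax outputs, take per-position cross entropy, zero
+        # out the padded positions with the batch mask, sum into a scalar
+        # loss, then backward, minimize with global-norm gradient clipping
+        # and clear the gradients by hand.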
+        for epoch in range(epoch_num):
+            batch_id = 0
+            total_loss = 0.0
+            for data in train_reader():
+                total_step += 1
+                data_dict = get_attention_feeder_data(data)
+
+                label_in = to_variable(data_dict["label_in"])
+                label_out = to_variable(data_dict["label_out"])
+
+                label_out._stop_gradient = True
+                label_out.trainable = False
+
+                img = to_variable(data_dict["pixel"])
+
+                prediction = ocr_attention(img, label_in)
+                prediction = fluid.layers.reshape(
+                    prediction, [label_out.shape[0] * label_out.shape[1], -1],
+                    inplace=False)
+                label_out = fluid.layers.reshape(
+                    label_out, [-1, 1], inplace=False)
+                loss = fluid.layers.cross_entropy(
+                    input=prediction, label=label_out)
+
+                mask = to_variable(data_dict["mask"])
+                loss = layers.elementwise_mul(loss, mask, axis=0)
+                avg_loss = fluid.layers.reduce_sum(loss)
+
+                total_loss += avg_loss.numpy()
+                avg_loss.backward()
+                optimizer.minimize(avg_loss, grad_clip=grad_clip)
+                ocr_attention.clear_gradients()
+
+                framework._dygraph_tracer()._clear_ops()
+
+                if batch_id > 0 and batch_id % 1000 == 0:
+                    print("epoch: {}, batch_id: {}, loss {}".format(
+                        epoch, batch_id, total_loss / args.batch_size / 1000))
+                    total_loss = 0.0
+
+                if total_step > 0 and total_step % 2000 == 0:
+                    # NOTE: checkpoints go to ./model/, not args.save_model_dir;
+                    # the directory must already exist.
+                    model_value = ocr_attention.state_dict()
+                    np.savez("model/" + str(total_step), **model_value)
+
+                    ocr_attention.eval()
+                    eval()
+                    ocr_attention.train()
+
+                batch_id += 1
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+    if args.profile:
+        if args.use_gpu:
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(args)
+        else:
+            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
+                train(args)
+    else:
+        train(args)
--
GitLab