From 89f00e53630af9711e6606f4effb935211790599 Mon Sep 17 00:00:00 2001
From: Hongyu Liu <43953930+phlrain@users.noreply.github.com>
Date: Thu, 30 May 2019 11:24:34 +0800
Subject: [PATCH] Add dygraph ocr model (#2333)

* add pbt lm; test=develop

* add dynamic ocr recognition; test=develop
---
 dygraph/ocr_recognition/data_reader.py | 273 +++++++++++
 dygraph/ocr_recognition/debug.sh       |   4 +
 dygraph/ocr_recognition/train.py       | 607 +++++++++++++++++++++++++
 3 files changed, 884 insertions(+)
 create mode 100644 dygraph/ocr_recognition/data_reader.py
 create mode 100644 dygraph/ocr_recognition/debug.sh
 create mode 100644 dygraph/ocr_recognition/train.py

diff --git a/dygraph/ocr_recognition/data_reader.py b/dygraph/ocr_recognition/data_reader.py
new file mode 100644
index 00000000..00e98d12
--- /dev/null
+++ b/dygraph/ocr_recognition/data_reader.py
@@ -0,0 +1,273 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tarfile
+import numpy as np
+from PIL import Image
+from os import path
+import paddle
+
+SOS = 0
+EOS = 1
+NUM_CLASSES = 95
+DATA_SHAPE = [1, 48, 512]
+
+DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
+DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
+CACHE_DIR_NAME = "ctc_data"
+SAVED_FILE_NAME = "data.tar.gz"
+DATA_DIR_NAME = "data"
+TRAIN_DATA_DIR_NAME = "train_images"
+TEST_DATA_DIR_NAME = "test_images"
+TRAIN_LIST_FILE_NAME = "train.list"
+TEST_LIST_FILE_NAME = "test.list"
+
+
+class DataGenerator(object):
+    def __init__(self, model="crnn_ctc"):
+        self.model = model
+
+    def train_reader(self,
+                     img_root_dir,
+                     img_label_list,
+                     batchsize,
+                     cycle,
+                     max_length,
+                     shuffle=True):
+        '''
+        Reader interface for training.
+
+        :param img_root_dir: The root directory of the training images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for training.
+        :type img_label_list: str
+
+        :param batchsize: The number of samples in one batch.
+        :type batchsize: int
+
+        :param cycle: If True, the reader cycles through the dataset
+            indefinitely, so more than dataset_size / batchsize iterations
+            can be drawn from it.
+        :type cycle: bool
+
+        :param max_length: Currently unused; each batch is padded to the
+            length of its longest label instead.
+        :type max_length: int
+
+        :param shuffle: Whether to shuffle the list file (a batch-level
+            shuffle when batchsize > 1).
+        :type shuffle: bool
+        '''
+
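+        # The list file is reordered by a shell pipeline before reading.
+        # With shuffle enabled and batchsize > 1: lines are sorted by image
+        # height with a random tie-break, so each batch groups images of
+        # similar height; a random number of leading lines is dropped so
+        # batch boundaries differ from run to run; the resulting batches
+        # are shuffled and then split back into one line per sample.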
+        img_label_lines = []
+        to_file = "tmp.txt"
+        if not shuffle:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' > " + to_file
+        elif batchsize == 1:
+            cmd = "cat " + img_label_list + " | awk '{print $1,$2,$3,$4;}' | shuf > " + to_file
+        else:
+            #cmd1: partial shuffle
+            cmd = "cat " + img_label_list + " | awk '{printf(\"%04d%.4f %s\\n\", $1, rand(), $0)}' | sort | sed 1,$((1 + RANDOM % 100))d | "
+            #cmd2: batch merge and shuffle
+            cmd += "awk '{printf $2\" \"$3\" \"$4\" \"$5\" \"; if(NR % " + str(
+                batchsize) + " == 0) print \"\";}' | shuf | "
+            #cmd3: batch split
+            cmd += "awk '{if(NF == " + str(
+                batchsize
+            ) + " * 4) {for(i = 0; i < " + str(
+                batchsize
+            ) + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
+        os.system(cmd)
+        print("finish batch shuffle")
+        img_label_lines = open(to_file, 'r').readlines()
+
+        def reader():
+            sizes = len(img_label_lines) // batchsize
+            if sizes == 0:
+                raise ValueError('batchsize is bigger than the dataset size.')
+            while True:
+                for i in range(sizes):
+                    result = []
+                    sz = [0, 0]
+                    # First pass: find the longest label in this batch.
+                    max_len = 0
+                    for k in range(batchsize):
+                        line = img_label_lines[i * batchsize + k]
+                        items = line.split(' ')
+                        label = [int(c) for c in items[-1].split(',')]
+                        max_len = max(max_len, len(label))
+
+                    max_length = max_len
+
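+                    # Second pass: build one sample per line. Every label is
+                    # padded with EOS (or truncated; the longest label loses
+                    # its last character) so that label_in = [SOS] + label
+                    # and label_out = label + [EOS] both have max_length
+                    # entries, and a float mask marks the valid positions so
+                    # padding can be zeroed out of the loss later.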
+                    for j in range(batchsize):
+                        line = img_label_lines[i * batchsize + j]
+                        items = line.split(' ')
+                        label = [int(c) for c in items[-1].split(',')]
+
+                        mask = np.zeros((max_len)).astype('float32')
+                        mask[:len(label) + 1] = 1.0
+                        if max_length > len(label) + 1:
+                            extend_label = [EOS] * (max_length - len(label) - 1)
+                            label.extend(extend_label)
+                        else:
+                            label = label[0:max_length - 1]
+                        img = Image.open(os.path.join(img_root_dir, items[
+                            2])).convert('L')
+                        # Resize every image to the size of the first image
+                        # in the batch so the batch can be stacked.
+                        if j == 0:
+                            sz = img.size
+                        img = img.resize((sz[0], sz[1]))
+                        img = np.array(img) - 127.5
+                        img = img[np.newaxis, ...]
+                        result.append([img, [SOS] + label, label + [EOS], mask])
+                    yield result
+                if not cycle:
+                    break
+
+        return reader
+
+    def test_reader(self, img_root_dir, img_label_list):
+        '''
+        Reader interface for evaluation.
+
+        :param img_root_dir: The root directory of the test images.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for testing.
+        :type img_label_list: str
+        '''
+
+        def reader():
+            for line in open(img_label_list):
+                # h, w, img_name, labels
+                items = line.split(' ')
+
+                label = [int(c) for c in items[-1].split(',')]
+                img = Image.open(os.path.join(img_root_dir, items[2])).convert(
+                    'L')
+                img = np.array(img) - 127.5
+                img = img[np.newaxis, ...]
+                if self.model == "crnn_ctc":
+                    yield img, label
+                else:
+                    yield img, [SOS] + label, label + [EOS]
+
+        return reader
+
+    def infer_reader(self, img_root_dir=None, img_label_list=None, cycle=False):
+        '''A reader interface for inference.
+
+        :param img_root_dir: The root directory of the images for inference.
+        :type img_root_dir: str
+
+        :param img_label_list: The path of the label-list file for inference.
+            If it is None, image paths are read from stdin instead.
+        :type img_label_list: str
+
+        :param cycle: If True, the reader cycles through the dataset
+            indefinitely.
+        :type cycle: bool
+        '''
+
+        def reader():
+            def yield_img_and_label(lines):
+                for line in lines:
+                    if img_root_dir is not None:
+                        # h, w, img_name, labels
+                        img_name = line.split(' ')[2]
+                        img_path = os.path.join(img_root_dir, img_name)
+                    else:
+                        img_path = line.strip("\t\n\r")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    label = [int(c) for c in line.split(' ')[3].split(',')]
+                    yield img, label
+
+            if img_label_list is not None:
+                lines = []
+                with open(img_label_list) as f:
+                    lines = f.readlines()
+                for img, label in yield_img_and_label(lines):
+                    yield img, label
+                while cycle:
+                    for img, label in yield_img_and_label(lines):
+                        yield img, label
+            else:
+                while True:
+                    img_path = input("Please input the path of image: ")
+                    img = Image.open(img_path).convert('L')
+                    img = np.array(img) - 127.5
+                    img = img[np.newaxis, ...]
+                    yield img, [[0]]
+
+        return reader
+
+
+def num_classes():
+    '''Get the number of classes of this dataset.
+    '''
+    return NUM_CLASSES
+
+
+def data_shape():
+    '''Get the image shape of this dataset. It is a dummy shape for this
+    dataset.
+    '''
+    return DATA_SHAPE
+
+
+def train(batch_size,
+          max_length,
+          train_images_dir=None,
+          train_list_file=None,
+          cycle=False,
+          shuffle=False,
+          model="crnn_ctc"):
+    generator = DataGenerator(model)
+    if train_images_dir is None or train_list_file is None:
+        data_dir = download_data()
+    if train_images_dir is None:
+        train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
+    if train_list_file is None:
+        train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME)
+    return generator.train_reader(
+        train_images_dir,
+        train_list_file,
+        batch_size,
+        cycle,
+        max_length,
+        shuffle=shuffle)
+
+
+def test(batch_size=1,
+         test_images_dir=None,
+         test_list_file=None,
+         model="crnn_ctc"):
+    generator = DataGenerator(model)
+    if test_images_dir is None or test_list_file is None:
+        data_dir = download_data()
+    if test_images_dir is None:
+        test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
+    if test_list_file is None:
+        test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME)
+    return paddle.batch(
+        generator.test_reader(test_images_dir, test_list_file), batch_size)
+
+
+def inference(batch_size=1,
+              infer_images_dir=None,
+              infer_list_file=None,
+              cycle=False,
+              model="crnn_ctc"):
+    generator = DataGenerator(model)
+    return paddle.batch(
+        generator.infer_reader(infer_images_dir, infer_list_file, cycle),
+        batch_size)
+
+
+def download_data():
+    '''Download the train and test data.
+    '''
+    tar_file = paddle.dataset.common.download(
+        DATA_URL, CACHE_DIR_NAME, DATA_MD5, save_name=SAVED_FILE_NAME)
+    data_dir = path.join(path.dirname(tar_file), DATA_DIR_NAME)
+    if not path.isdir(data_dir):
+        t = tarfile.open(tar_file, "r:gz")
+        t.extractall(path=path.dirname(tar_file))
+        t.close()
+    return data_dir
diff --git a/dygraph/ocr_recognition/debug.sh b/dygraph/ocr_recognition/debug.sh
new file mode 100644
index 00000000..076a52aa
--- /dev/null
+++ b/dygraph/ocr_recognition/debug.sh
@@ -0,0 +1,4 @@
+
+export CUDA_VISIBLE_DEVICES=0
+
+python train.py
diff --git a/dygraph/ocr_recognition/train.py b/dygraph/ocr_recognition/train.py
new file mode 100644
index 00000000..954612af
--- /dev/null
+++ b/dygraph/ocr_recognition/train.py
@@ -0,0 +1,607 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import numpy as np
+import paddle.fluid.profiler as profiler
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import data_reader
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC, BatchNorm, Embedding, GRUUnit
+from paddle.fluid.dygraph.base import to_variable
+import argparse
+import functools
+from utility import add_arguments, print_arguments, get_attention_feeder_data
+
+from paddle.fluid import framework
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('batch_size', int, 32, "Minibatch size.")
+add_arg('total_step', int, 720000, "The number of iterations. Zero or less means one pass over the whole training set; more than zero means the training set may be looped until that many iterations have been run.")
+add_arg('log_period', int, 1000, "Log period.")
+add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
+add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
+add_arg('save_model_dir', str, "./models", "The directory the model is saved to.")
+add_arg('train_images', str, None, "The directory of images to be used for training.")
+add_arg('train_list', str, None, "The list file of images to be used for training.")
+add_arg('test_images', str, None, "The directory of images to be used for test.")
+add_arg('test_list', str, None, "The list file of images to be used for testing.")
+add_arg('model', str, "attention", "Which type of network to be used. 'crnn_ctc' or 'attention'")
+add_arg('init_model', str, None, "The initial model file or directory.")
+add_arg('use_gpu', bool, True, "Whether to use GPU for training.")
+add_arg('min_average_window', int, 10000, "Min average window.")
+add_arg('max_average_window', int, 12500, "Max average window. It is proposed to be set as the number of minibatches in a pass.")
+add_arg('average_window', float, 0.15, "Average window.")
+add_arg('parallel', bool, False, "Whether to use parallel training.")
+add_arg('profile', bool, False, "Whether to use profiling.")
+add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for a better performance test.")
+add_arg('skip_test', bool, False, "Whether to skip the test phase.")
+
+
+class Config(object):
+    '''
+    Configuration for training.
+    '''
+    # decoder size for the decoder stage
+    decoder_size = 128
+    # size of the word embedding
+    word_vector_dim = 128
+    # max length for label padding
+    max_length = 100
+    gradient_clip = 10
+    LR = 1.0
+    beam_size = 2
+    learning_rate_decay = None
+
+    # batch size for training
+    batch_size = 32
+    # number of classes to classify
+    num_classes = 95
+
+    use_gpu = False
+    # special labels for start and end
+    SOS = 0
+    EOS = 1
+    # settings for the ctc data, not used in unittest
+    DATA_DIR_NAME = "./dataset/ctc_data/data"
+    TRAIN_DATA_DIR_NAME = "train_images"
+    TRAIN_LIST_FILE_NAME = "train.list"
+
+    # data shape of the input image
+    DATA_SHAPE = [1, 48, 512]
+
+
+class ConvBNPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 group,
+                 out_ch,
+                 channels,
+                 act="relu",
+                 is_test=False,
+                 pool=True,
+                 use_cudnn=True):
+        super(ConvBNPool, self).__init__(name_scope)
+        self.group = group
+        self.pool = pool
+
+        filter_size = 3
+        conv_std_0 = (2.0 / (filter_size**2 * channels[0]))**0.5
+        conv_param_0 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_0))
+
+        conv_std_1 = (2.0 / (filter_size**2 * channels[1]))**0.5
+        conv_param_1 = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, conv_std_1))
+
+        self.conv_0_layer = Conv2D(
+            self.full_name(),
+            channels[0],
+            out_ch[0],
+            3,
+            padding=1,
+            param_attr=conv_param_0,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_0_layer = BatchNorm(
+            self.full_name(), out_ch[0], act=act, is_test=is_test)
+        self.conv_1_layer = Conv2D(
+            self.full_name(),
+            num_channels=channels[1],
+            num_filters=out_ch[1],
+            filter_size=3,
+            padding=1,
+            param_attr=conv_param_1,
+            bias_attr=False,
+            act=None,
+            use_cudnn=use_cudnn)
+        self.bn_1_layer = BatchNorm(
+            self.full_name(), out_ch[1], act=act, is_test=is_test)
+
+        if self.pool:
+            self.pool_layer = Pool2D(
+                self.full_name(),
+                pool_size=2,
+                pool_type='max',
+                pool_stride=2,
+                use_cudnn=use_cudnn,
+                ceil_mode=True)
+
+    def forward(self, inputs):
+        conv_0 = self.conv_0_layer(inputs)
+        bn_0 = self.bn_0_layer(conv_0)
+        conv_1 = self.conv_1_layer(bn_0)
+        bn_1 = self.bn_1_layer(conv_1)
+        if self.pool:
+            return self.pool_layer(bn_1)
+        return bn_1
+
+
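+# OCRConv stacks four conv-BN blocks; the first three end with 2x2 max
+# pooling, so a 1x48x512 input is reduced to a 128-channel feature map at
+# 1/8 of the spatial resolution. Its width later serves as the time axis
+# for the recurrent encoder.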
+class OCRConv(fluid.dygraph.Layer):
+    def __init__(self, name_scope, is_test=False, use_cudnn=True):
+        super(OCRConv, self).__init__(name_scope)
+        self.conv_bn_pool_1 = ConvBNPool(
+            self.full_name(),
+            2, [16, 16], [1, 16],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_2 = ConvBNPool(
+            self.full_name(),
+            2, [32, 32], [16, 32],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_3 = ConvBNPool(
+            self.full_name(),
+            2, [64, 64], [32, 64],
+            is_test=is_test,
+            use_cudnn=use_cudnn)
+        self.conv_bn_pool_4 = ConvBNPool(
+            self.full_name(),
+            2, [128, 128], [64, 128],
+            is_test=is_test,
+            pool=False,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        inputs_1 = self.conv_bn_pool_1(inputs)
+        inputs_2 = self.conv_bn_pool_2(inputs_1)
+        inputs_3 = self.conv_bn_pool_3(inputs_2)
+        inputs_4 = self.conv_bn_pool_4(inputs_3)
+
+        return inputs_4
+
+
+class DynamicGRU(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 h_0=None,
+                 origin_mode=False,
+                 init_size=None):
+        super(DynamicGRU, self).__init__(scope_name)
+
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+        self.size = size
+        self.h_0 = h_0
+        self.is_reverse = is_reverse
+
+    def forward(self, inputs):
+        # Unroll the GRU over the time axis step by step; dygraph has no
+        # dynamic-length GRU layer, so GRUUnit is applied once per step.
+        hidden = self.h_0
+        res = []
+        for i in range(inputs.shape[1]):
+            if self.is_reverse:
+                i = inputs.shape[1] - 1 - i
+            input_ = inputs[:, i:i + 1, :]
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
+            hidden, reset, gate = self.gru_unit(input_, hidden)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            res.append(hidden_)
+        if self.is_reverse:
+            res = res[::-1]
+        res = fluid.layers.concat(res, axis=1)
+        return res
+
+
+class EncoderNet(fluid.dygraph.Layer):
+    def __init__(self,
+                 scope_name,
+                 rnn_hidden_size=200,
+                 is_test=False,
+                 use_cudnn=True):
+        super(EncoderNet, self).__init__(scope_name)
+        self.rnn_hidden_size = rnn_hidden_size
+        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0,
+                                                                         0.02))
+        bias_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
+        if fluid.framework.in_dygraph_mode():
+            h_0 = np.zeros(
+                (Config.batch_size, rnn_hidden_size), dtype="float32")
+            h_0 = to_variable(h_0)
+        else:
+            h_0 = fluid.layers.fill_constant(
+                shape=[Config.batch_size, rnn_hidden_size],
+                dtype='float32',
+                value=0)
+        self.ocr_convs = OCRConv(
+            self.full_name(), is_test=is_test, use_cudnn=use_cudnn)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.fc_2_layer = FC(self.full_name(),
+                             rnn_hidden_size * 3,
+                             param_attr=para_attr,
+                             bias_attr=False,
+                             num_flatten_dims=2)
+        self.gru_forward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu')
+        self.gru_backward_layer = DynamicGRU(
+            self.full_name(),
+            size=rnn_hidden_size,
+            h_0=h_0,
+            param_attr=para_attr,
+            bias_attr=bias_attr,
+            candidate_activation='relu',
+            is_reverse=True)
+
+        self.encoded_proj_fc = FC(self.full_name(),
+                                  Config.decoder_size,
+                                  bias_attr=False,
+                                  num_flatten_dims=2)
+
+    def forward(self, inputs):
+        conv_features = self.ocr_convs(inputs)
+        # NCHW -> NWCH, then flatten each column of the feature map so that
+        # the image width becomes the sequence dimension.
+        transpose_conv_features = fluid.layers.transpose(
+            conv_features, perm=[0, 3, 1, 2])
+        sliced_feature = fluid.layers.reshape(
+            transpose_conv_features, [
+                -1, transpose_conv_features.shape[1],
+                transpose_conv_features.shape[2] *
+                transpose_conv_features.shape[3]
+            ],
+            inplace=False)
+
+        fc_1 = self.fc_1_layer(sliced_feature)
+        fc_2 = self.fc_2_layer(sliced_feature)
+
+        gru_forward = self.gru_forward_layer(fc_1)
+        gru_backward = self.gru_backward_layer(fc_2)
+
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward, gru_backward], axis=2)
+        encoded_proj = self.encoded_proj_fc(encoded_vector)
+
+        return gru_backward, encoded_vector, encoded_proj
+
+
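+# SimpleAttention implements additive (Bahdanau-style) attention: the decoder
+# state is projected, broadcast over the encoder time steps and added to the
+# pre-computed encoder projection; tanh plus a second FC yield one score per
+# step, softmax turns the scores into weights, and the context vector is the
+# weighted sum of the encoder outputs.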
+class SimpleAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size):
+        super(SimpleAttention, self).__init__(scope_name)
+
+        self.fc_1 = FC(self.full_name(),
+                       decoder_size,
+                       act=None,
+                       bias_attr=False)
+        self.fc_2 = FC(self.full_name(),
+                       1,
+                       num_flatten_dims=2,
+                       act=None,
+                       bias_attr=False)
+
+    def _build_once(self, encoder_vec, encoder_proj, decoder_state):
+        pass
+
+    def forward(self, encoder_vec, encoder_proj, decoder_state):
+        decoder_state_fc = self.fc_1(decoder_state)
+        decoder_state_proj_reshape = fluid.layers.reshape(
+            decoder_state_fc, [-1, 1, decoder_state_fc.shape[1]],
+            inplace=False)
+        decoder_state_expand = fluid.layers.expand(
+            decoder_state_proj_reshape, [1, encoder_proj.shape[1], 1])
+        concated = fluid.layers.elementwise_add(encoder_proj,
+                                                decoder_state_expand)
+        concated = fluid.layers.tanh(x=concated)
+        attention_weight = self.fc_2(concated)
+        weights_reshape = fluid.layers.reshape(
+            x=attention_weight, shape=[concated.shape[0], -1], inplace=False)
+        weights_reshape = fluid.layers.softmax(weights_reshape)
+        scaled = fluid.layers.elementwise_mul(
+            x=encoder_vec, y=weights_reshape, axis=0)
+        context = fluid.layers.reduce_sum(scaled, dim=1)
+
+        return context
+
+
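+# The decoder is teacher-forced during training: at every target position it
+# attends over the encoder states, mixes the context vector with the
+# embedding of the previous ground-truth character, advances one GRU step and
+# emits a softmax over num_classes + 2 labels (characters plus SOS/EOS).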
+class GRUDecoderWithAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name, decoder_size, num_classes):
+        super(GRUDecoderWithAttention, self).__init__(scope_name)
+        self.simple_attention = SimpleAttention(self.full_name(), decoder_size)
+
+        self.fc_1_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.fc_2_layer = FC(self.full_name(),
+                             size=decoder_size * 3,
+                             bias_attr=False)
+        self.gru_unit = GRUUnit(
+            self.full_name(),
+            size=decoder_size * 3,
+            param_attr=None,
+            bias_attr=None)
+        self.out_layer = FC(self.full_name(),
+                            size=num_classes + 2,
+                            bias_attr=None,
+                            act='softmax')
+
+        self.decoder_size = decoder_size
+
+    def _build_once(self, target_embedding, encoder_vec, encoder_proj,
+                    decoder_boot):
+        pass
+
+    def forward(self, target_embedding, encoder_vec, encoder_proj,
+                decoder_boot):
+        res = []
+        hidden_mem = decoder_boot
+        for i in range(target_embedding.shape[1]):
+            current_word = fluid.layers.slice(
+                target_embedding, axes=[1], starts=[i], ends=[i + 1])
+            current_word = fluid.layers.reshape(
+                current_word, [-1, current_word.shape[2]], inplace=False)
+
+            context = self.simple_attention(encoder_vec, encoder_proj,
+                                            hidden_mem)
+            fc_1 = self.fc_1_layer(context)
+            fc_2 = self.fc_2_layer(current_word)
+            decoder_inputs = fluid.layers.elementwise_add(x=fc_1, y=fc_2)
+
+            h, _, _ = self.gru_unit(decoder_inputs, hidden_mem)
+            hidden_mem = h
+            out = self.out_layer(h)
+            res.append(out)
+
+        res1 = fluid.layers.concat(res, axis=1)
+        batch_size = target_embedding.shape[0]
+        seq_len = target_embedding.shape[1]
+        res1 = layers.reshape(res1, shape=[batch_size, seq_len, -1])
+
+        return res1
+
+
+class OCRAttention(fluid.dygraph.Layer):
+    def __init__(self, scope_name):
+        super(OCRAttention, self).__init__(scope_name)
+        self.encoder_net = EncoderNet(self.full_name())
+        self.fc = FC(self.full_name(),
+                     size=Config.decoder_size,
+                     bias_attr=False,
+                     act='relu')
+        self.embedding = Embedding(
+            self.full_name(), [Config.num_classes + 2, Config.word_vector_dim],
+            dtype='float32')
+        self.gru_decoder_with_attention = GRUDecoderWithAttention(
+            self.full_name(), Config.decoder_size, Config.num_classes)
+
+    def _build_once(self, inputs, label_in):
+        pass
+
+    def forward(self, inputs, label_in):
+        gru_backward, encoded_vector, encoded_proj = self.encoder_net(inputs)
+        # The first step of the backward GRU summarizes the whole sequence;
+        # it is projected to bootstrap the decoder state.
+        backward_first = fluid.layers.slice(
+            gru_backward, axes=[1], starts=[0], ends=[1])
+        backward_first = fluid.layers.reshape(
+            backward_first, [-1, backward_first.shape[2]], inplace=False)
+
+        decoder_boot = self.fc(backward_first)
+
+        label_in = fluid.layers.reshape(label_in, [-1, 1], inplace=False)
+        trg_embedding = self.embedding(label_in)
+
+        trg_embedding = fluid.layers.reshape(
+            trg_embedding, [Config.batch_size, -1, trg_embedding.shape[1]],
+            inplace=False)
+
+        prediction = self.gru_decoder_with_attention(
+            trg_embedding, encoded_vector, encoded_proj, decoder_boot)
+
+        return prediction
+
+
+def train(args):
+
+    with fluid.dygraph.guard():
+        backward_strategy = fluid.dygraph.BackwardStrategy()
+        backward_strategy.sort_sum_gradient = True
+        ocr_attention = OCRAttention("ocr_attention")
+
+        if Config.learning_rate_decay == "piecewise_decay":
+            learning_rate = fluid.layers.piecewise_decay(
+                [50000], [Config.LR, Config.LR * 0.01])
+        else:
+            learning_rate = Config.LR
+        # NOTE: the schedule computed above is currently not passed to the
+        # optimizer; training runs with a fixed learning rate of 0.001.
+        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+
+        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)
+
+        train_reader = data_reader.train(
+            Config.batch_size,
+            max_length=Config.max_length,
+            train_images_dir=args.train_images,
+            train_list_file=args.train_list,
+            cycle=args.total_step > 0,
+            shuffle=True,
+            model=args.model)
+
+        # Evaluation reuses the batched train reader over the test split
+        # (no shuffling, no cycling).
+        infer_image = './data/data/test_images/'
+        infer_files = './data/data/test.list'
+        test_reader = data_reader.train(
+            Config.batch_size,
+            1000,
+            train_images_dir=infer_image,
+            train_list_file=infer_files,
+            cycle=False,
+            model=args.model)
+
+        def eval():
+            ocr_attention.eval()
+            total_step = 0.0
+            equal_size = 0
+            for data in test_reader():
+                data_dict = get_attention_feeder_data(data)
+
+                label_in = to_variable(data_dict["label_in"])
+                label_out = to_variable(data_dict["label_out"])
+
+                label_out._stop_gradient = True
+                label_out.trainable = False
+
+                img = to_variable(data_dict["pixel"])
+
+                prediction = ocr_attention(img, label_in)
+                prediction = fluid.layers.reshape(
+                    prediction, [label_out.shape[0] * label_out.shape[1], -1],
+                    inplace=False)
+
+                score, topk = layers.topk(prediction, 1)
+
+                seq = topk.numpy()
+                seq = seq.reshape((args.batch_size, -1))
+
+                mask = data_dict['mask'].reshape((args.batch_size, -1))
+                seq_len = np.sum(mask, -1)
+
+                trans_ref = data_dict["label_out"].reshape(
+                    (args.batch_size, -1))
+                # A sample counts as correct only if the whole predicted
+                # sequence matches the reference.
+                for i in range(args.batch_size):
+                    length = int(seq_len[i] - 1)
+                    trans = seq[i][:length - 1]
+                    ref = trans_ref[i][:length - 1]
+                    if np.array_equal(trans, ref):
+                        equal_size += 1
+
+                total_step += args.batch_size
+            print("eval accuracy", equal_size / total_step)
+
+        total_step = 0
+        epoch_num = 20
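+        # One training step: forward with teacher forcing, flatten the
+        # per-step softmax outputs, take per-position cross entropy, zero
+        # out the padded positions with the batch mask, sum into a scalar
+        # loss, then backward, minimize with global-norm gradient clipping
+        # and clear the gradients by hand.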
+        for epoch in range(epoch_num):
+            batch_id = 0
+            total_loss = 0.0
+            for data in train_reader():
+                total_step += 1
+                data_dict = get_attention_feeder_data(data)
+
+                label_in = to_variable(data_dict["label_in"])
+                label_out = to_variable(data_dict["label_out"])
+
+                label_out._stop_gradient = True
+                label_out.trainable = False
+
+                img = to_variable(data_dict["pixel"])
+
+                prediction = ocr_attention(img, label_in)
+                prediction = fluid.layers.reshape(
+                    prediction, [label_out.shape[0] * label_out.shape[1], -1],
+                    inplace=False)
+                label_out = fluid.layers.reshape(
+                    label_out, [-1, 1], inplace=False)
+                loss = fluid.layers.cross_entropy(
+                    input=prediction, label=label_out)
+
+                mask = to_variable(data_dict["mask"])
+                loss = layers.elementwise_mul(loss, mask, axis=0)
+                avg_loss = fluid.layers.reduce_sum(loss)
+
+                total_loss += avg_loss.numpy()
+                avg_loss.backward()
+                optimizer.minimize(avg_loss, grad_clip=grad_clip)
+                ocr_attention.clear_gradients()
+
+                framework._dygraph_tracer()._clear_ops()
+
+                if batch_id > 0 and batch_id % 1000 == 0:
+                    print("epoch: {}, batch_id: {}, loss {}".format(
+                        epoch, batch_id, total_loss / args.batch_size / 1000))
+                    total_loss = 0.0
+
+                if total_step > 0 and total_step % 2000 == 0:
+                    # NOTE: checkpoints go to ./model/, not args.save_model_dir;
+                    # the directory must already exist.
+                    model_value = ocr_attention.state_dict()
+                    np.savez("model/" + str(total_step), **model_value)
+
+                    ocr_attention.eval()
+                    eval()
+                    ocr_attention.train()
+
+                batch_id += 1
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+    if args.profile:
+        if args.use_gpu:
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(args)
+        else:
+            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
+                train(args)
+    else:
+        train(args)
--
GitLab