diff --git a/fluid/ocr_recognition/README.md b/fluid/ocr_recognition/README.md index 7d35846fb1b67ce4fec7f364a22dce9cb853bb24..91aab78736cd77083e32c71e8e49253592c3248c 100644 --- a/fluid/ocr_recognition/README.md +++ b/fluid/ocr_recognition/README.md @@ -5,19 +5,19 @@ # Optical Character Recognition -这里将介绍如何在PaddlePaddle fluid下使用CRNN-CTC 和 CRNN-Attention模型对图片中的文字内容进行识别。 +这里将介绍如何在PaddlePaddle Fluid下使用CRNN-CTC 和 CRNN-Attention模型对图片中的文字内容进行识别。 ## 1. CRNN-CTC -本章的任务是识别含有单行汉语字符图片,首先采用卷积将图片转为`features map`, 然后使用`im2sequence op`将`features map`转为`sequence`,经过`双向GRU RNN`得到每个step的汉语字符的概率分布。训练过程选用的损失函数为CTC loss,最终的评估指标为`instance error rate`。 +本章的任务是识别含有单行汉语字符图片,首先采用卷积将图片转为特征图, 然后使用`im2sequence op`将特征图转为序列,通过`双向GRU`学习到序列特征。训练过程选用的损失函数为CTC(Connectionist Temporal Classification) loss,最终的评估指标为样本级别的错误率。 本路径下各个文件的作用如下: - **ctc_reader.py :** 下载、读取、处理数据。提供方法`train()` 和 `test()` 分别产生训练集和测试集的数据迭代器。 - **crnn_ctc_model.py :** 在该脚本中定义了训练网络、预测网络和evaluate网络。 - **ctc_train.py :** 用于模型的训练,可通过命令`python train.py --help` 获得使用方法。 -- **inference.py :** 加载训练好的模型文件,对新数据进行预测。可通过命令`python inference.py --help` 获得使用方法。 -- **eval.py :** 评估模型在指定数据集上的效果。可通过命令`python inference.py --help` 获得使用方法。 +- **infer.py :** 加载训练好的模型文件,对新数据进行预测。可通过命令`python infer.py --help` 获得使用方法。 +- **eval.py :** 评估模型在指定数据集上的效果。可通过命令`python eval.py --help` 获得使用方法。 - **utility.py :** 实现的一些通用方法,包括参数配置、tensor的构造等。 @@ -34,11 +34,11 @@ 图 1

-在训练集中,每张图片对应的label是由若干数字组成的sequence。 Sequence中的每个数字表示一个字符在字典中的index。 `图1` 对应的label如下所示: +在训练集中,每张图片对应的label是汉字在词典中的索引。 `图1` 对应的label如下所示: ``` 3835,8371,7191,2369,6876,4162,1938,168,1517,4590,3793 ``` -在上边这个label中,`3835` 表示字符‘两’的index,`4590` 表示中文字符逗号的index。 +在上边这个label中,`3835` 表示字符‘两’的索引,`4590` 表示中文字符逗号的索引。 #### 1.1.2 数据准备 @@ -122,7 +122,7 @@ env CUDA_VISIABLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True 执行`python ctc_train.py --help`可查看更多使用方式和参数详细说明。 -图2为使用默认参数和默认数据集训练的收敛曲线,其中横坐标轴为训练pass数,纵轴为在测试集上的sequence_error. +图2为使用默认参数和默认数据集训练的收敛曲线,其中横坐标轴为训练迭代次数,纵轴为样本级错误率。其中,蓝线为训练集上的样本错误率,红线为测试集上的样本错误率。在45轮迭代训练中,测试集上最低错误率为第60轮的21.11%.


@@ -150,7 +150,7 @@ env CUDA_VISIBLE_DEVICE=0 python eval.py \ 从标准输入读取一张图片的路径,并对齐进行预测: ``` -env CUDA_VISIBLE_DEVICE=0 python inference.py \ +env CUDA_VISIBLE_DEVICE=0 python infer.py \ --model_path="models/model_00044_15000" ``` @@ -163,17 +163,17 @@ input_images_dir: None input_images_list: None model_path: /home/work/models/fluid/ocr_recognition/models/model_00052_15000 ------------------------------------------------ -Init model from: /home/work/models/fluid/ocr_recognition/models/model_00052_15000. -Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0060.jpg +Init model from: ./models/model_00052_15000. +Please input the path of image: ./test_images/00001_0060.jpg result: [3298 2371 4233 6514 2378 3298 2363] -Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0429.jpg +Please input the path of image: ./test_images/00001_0429.jpg result: [2067 2067 8187 8477 5027 7191 2431 1462] ``` 从文件中批量读取图片路径,并对其进行预测: ``` -env CUDA_VISIBLE_DEVICE=0 python inference.py \ +env CUDA_VISIBLE_DEVICE=0 python infer.py \ --model_path="models/model_00044_15000" \ --input_images_list="data/test.list" ``` diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py index df33100e36e25d871db25dd304e87053dfb77145..1e687d2aa53c0c43a7b491a61b60fd2432210c95 100644 --- a/fluid/ocr_recognition/crnn_ctc_model.py +++ b/fluid/ocr_recognition/crnn_ctc_model.py @@ -19,6 +19,7 @@ def conv_bn_pool(input, param_attr=param if param_0 is None else param_0, act=None, # LinearActivation use_cudnn=True) + #tmp = fluid.layers.Print(tmp) tmp = fluid.layers.batch_norm( input=tmp, act=act, @@ -139,65 +140,31 @@ def encoder_net(images, def ctc_train_net(images, label, args, num_classes): - regularizer = fluid.regularizer.L2Decay(args.l2) - gradient_clip = None - if args.parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places, use_nccl=True) - with pd.do(): - 
images_ = pd.read_input(images) - label_ = pd.read_input(label) - - fc_out = encoder_net( - images_, - num_classes, - regularizer=regularizer, - gradient_clip=gradient_clip) - - cost = fluid.layers.warpctc( - input=fc_out, - label=label_, - blank=num_classes, - norm_by_times=True) - sum_cost = fluid.layers.reduce_sum(cost) - - decoded_out = fluid.layers.ctc_greedy_decoder( - input=fc_out, blank=num_classes) - - pd.write_output(sum_cost) - pd.write_output(decoded_out) - - sum_cost, decoded_out = pd() - sum_cost = fluid.layers.reduce_sum(sum_cost) - - else: - fc_out = encoder_net( - images, - num_classes, - regularizer=regularizer, - gradient_clip=gradient_clip) - - cost = fluid.layers.warpctc( - input=fc_out, label=label, blank=num_classes, norm_by_times=True) - sum_cost = fluid.layers.reduce_sum(cost) - decoded_out = fluid.layers.ctc_greedy_decoder( - input=fc_out, blank=num_classes) + L2_RATE = 0.0004 + LR = 1.0e-3 + MOMENTUM = 0.9 + regularizer = fluid.regularizer.L2Decay(L2_RATE) + fc_out = encoder_net(images, num_classes, regularizer=regularizer) + cost = fluid.layers.warpctc( + input=fc_out, label=label, blank=num_classes, norm_by_times=True) + sum_cost = fluid.layers.reduce_sum(cost) + decoded_out = fluid.layers.ctc_greedy_decoder( + input=fc_out, blank=num_classes) casted_label = fluid.layers.cast(x=label, dtype='int64') error_evaluator = fluid.evaluator.EditDistance( input=decoded_out, label=casted_label) inference_program = fluid.default_main_program().clone(for_test=True) - - optimizer = fluid.optimizer.Momentum( - learning_rate=args.learning_rate, momentum=args.momentum) + optimizer = fluid.optimizer.Momentum(learning_rate=LR, momentum=MOMENTUM) _, params_grads = optimizer.minimize(sum_cost) - model_average = fluid.optimizer.ModelAverage( - args.average_window, - params_grads, - min_average_window=args.min_average_window, - max_average_window=args.max_average_window) - + model_average = None + if args.average_window > 0: + model_average = 
fluid.optimizer.ModelAverage( + args.average_window, + params_grads, + min_average_window=args.min_average_window, + max_average_window=args.max_average_window) return sum_cost, error_evaluator, inference_program, model_average diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py index 35db803506179d162226ae553fa25bfd4323d567..0351ba905b89139f0794d36ddea84c33ecc3722a 100644 --- a/fluid/ocr_recognition/ctc_train.py +++ b/fluid/ocr_recognition/ctc_train.py @@ -8,6 +8,7 @@ import functools import sys import time import os +import numpy as np parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -19,32 +20,23 @@ add_arg('save_model_period', int, 15000, "Save model period. '-1' means n add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.") add_arg('save_model_dir', str, "./models", "The directory the model to be saved to.") add_arg('init_model', str, None, "The init model file of directory.") -add_arg('learning_rate', float, 1.0e-3, "Learning rate.") -add_arg('l2', float, 0.0004, "L2 regularizer.") -add_arg('momentum', float, 0.9, "Momentum.") -add_arg('rnn_hidden_size', int, 200, "Hidden size of rnn layers.") add_arg('use_gpu', bool, True, "Whether use GPU to train.") add_arg('min_average_window',int, 10000, "Min average window.") add_arg('max_average_window',int, 15625, "Max average window. It is proposed to be set as the number of minibatch in a pass.") add_arg('average_window', float, 0.15, "Average window.") add_arg('parallel', bool, False, "Whether use parallel training.") -add_arg('train_images', str, None, "The directory of training images." - "None means using the default training images of reader.") -add_arg('train_list', str, None, "The list file of training images." - "None means using the default train_list file of reader.") -add_arg('test_images', str, None, "The directory of training images." 
- "None means using the default test images of reader.") -add_arg('test_list', str, None, "The list file of training images." - "None means using the default test_list file of reader.") -add_arg('num_classes', int, None, "The number of classes." - "None means using the default num_classes from reader.") # yapf: enable def train(args, data_reader=ctc_reader): """OCR CTC training""" + num_classes = None + train_images = None + train_list = None + test_images = None + test_list = None num_classes = data_reader.num_classes( - ) if args.num_classes is None else args.num_classes + ) if num_classes is None else num_classes data_shape = data_reader.data_shape() # define network images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') @@ -56,10 +48,10 @@ def train(args, data_reader=ctc_reader): # data reader train_reader = data_reader.train( args.batch_size, - train_images_dir=args.train_images, - train_list_file=args.train_list) + train_images_dir=train_images, + train_list_file=train_list) test_reader = data_reader.test( - test_images_dir=args.test_images, test_list_file=args.test_list) + test_images_dir=test_images, test_list_file=test_list) # prepare environment place = fluid.CPUPlace() @@ -78,45 +70,72 @@ def train(args, data_reader=ctc_reader): fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name) print "Init model from: %s." 
% args.init_model - for pass_id in range(args.pass_num): + train_exe = exe + if args.parallel: + train_exe = fluid.ParallelExecutor( + use_cuda=True, loss_name=sum_cost.name) + + fetch_vars = [sum_cost] + error_evaluator.metrics + + def train_one_batch(data): + var_names = [var.name for var in fetch_vars] + if args.parallel: + results = train_exe.run(var_names, + feed_dict=get_feeder_data(data, place)) + results = [np.array(result).sum() for result in results] + else: + results = exe.run(feed=get_feeder_data(data, place), + fetch_list=fetch_vars) + results = [result[0] for result in results] + return results + + def test(pass_id, batch_id): error_evaluator.reset(exe) + for data in test_reader(): + exe.run(inference_program, feed=get_feeder_data(data, place)) + _, test_seq_error = error_evaluator.eval(exe) + print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % ( + time.time(), pass_id, batch_id, str(test_seq_error[0])) + + def save_model(args, exe, pass_id, batch_id): + filename = "model_%05d_%d" % (pass_id, batch_id) + fluid.io.save_params( + exe, dirname=args.save_model_dir, filename=filename) + print "Saved model to: %s/%s." % (args.save_model_dir, filename) + + error_evaluator.reset(exe) + for pass_id in range(args.pass_num): batch_id = 1 total_loss = 0.0 total_seq_error = 0.0 # train a pass for data in train_reader(): - batch_loss, _, batch_seq_error = exe.run( - fluid.default_main_program(), - feed=get_feeder_data(data, place), - fetch_list=[sum_cost] + error_evaluator.metrics) - total_loss += batch_loss[0] - total_seq_error += batch_seq_error[0] + results = train_one_batch(data) + total_loss += results[0] + total_seq_error += results[2] # training log if batch_id % args.log_period == 0: - print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." 
% ( + print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq err: %s" % ( time.time(), pass_id, batch_id, total_loss / (batch_id * args.batch_size), total_seq_error / (batch_id * args.batch_size)) sys.stdout.flush() + # evaluate if batch_id % args.eval_period == 0: - with model_average.apply(exe): - error_evaluator.reset(exe) - for data in test_reader(): - exe.run(inference_program, - feed=get_feeder_data(data, place)) - _, test_seq_error = error_evaluator.eval(exe) - - print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % ( - time.time(), pass_id, batch_id, str(test_seq_error[0])) + if model_average: + with model_average.apply(exe): + test(pass_id, batch_id) + else: + test(pass_id, batch_id) + # save model if batch_id % args.save_model_period == 0: - with model_average.apply(exe): - filename = "model_%05d_%d" % (pass_id, batch_id) - fluid.io.save_params( - exe, dirname=args.save_model_dir, filename=filename) - print "Saved model to: %s/%s." % (args.save_model_dir, - filename) + if model_average: + with model_average.apply(exe): + save_model(args, exe, pass_id, batch_id) + else: + save_model(args, exe, pass_id, batch_id) batch_id += 1 diff --git a/fluid/ocr_recognition/images/train.jpg b/fluid/ocr_recognition/images/train.jpg index 3d691f1cd6b44c99c1b89286573daf1abd6dcbfa..ec86fb1bf828699b3b63926accad0e943f25feeb 100644 Binary files a/fluid/ocr_recognition/images/train.jpg and b/fluid/ocr_recognition/images/train.jpg differ diff --git a/fluid/ocr_recognition/inference.py b/fluid/ocr_recognition/infer.py similarity index 75% rename from fluid/ocr_recognition/inference.py rename to fluid/ocr_recognition/infer.py index 04175bb15d7834b76818b330763054e0a519e508..080e3f5f84efbb73e3c2381e809222fd2a90c416 100644 --- a/fluid/ocr_recognition/inference.py +++ b/fluid/ocr_recognition/infer.py @@ -14,6 +14,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) add_arg('model_path', str, None, "The model path to be used for inference.") 
add_arg('input_images_dir', str, None, "The directory of images.") add_arg('input_images_list', str, None, "The list file of images.") +add_arg('dict', str, None, "The dictionary. The result of inference will be index sequence if the dictionary was None.") add_arg('use_gpu', bool, True, "Whether use GPU to infer.") # yapf: enable @@ -31,12 +32,21 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): infer_list_file=args.input_images_list) # prepare environment place = fluid.CPUPlace() - if use_gpu: + if args.use_gpu: place = fluid.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) + # load dictionary + dict_map = None + if args.dict is not None and os.path.isfile(args.dict): + dict_map = {} + with open(args.dict) as dict_file: + for i, word in enumerate(dict_file): + dict_map[i] = word.strip() + print "Loaded dict from %s" % args.dict + # load init model model_dir = args.model_path model_file_name = None @@ -52,7 +62,11 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): data, place, need_label=False), fetch_list=[sequence], return_numpy=False) - print "result: %s" % (np.array(result[0]).flatten(), ) + indexes = np.array(result[0]).flatten() + if dict_map is not None: + print "result: %s" % ([dict_map[index] for index in indexes], ) + else: + print "result: %s" % (indexes, ) def main():