diff --git a/fluid/ocr_recognition/README.md b/fluid/ocr_recognition/README.md index 7d35846fb1b67ce4fec7f364a22dce9cb853bb24..91aab78736cd77083e32c71e8e49253592c3248c 100644 --- a/fluid/ocr_recognition/README.md +++ b/fluid/ocr_recognition/README.md @@ -5,19 +5,19 @@ # Optical Character Recognition -这里将介绍如何在PaddlePaddle fluid下使用CRNN-CTC 和 CRNN-Attention模型对图片中的文字内容进行识别。 +这里将介绍如何在PaddlePaddle Fluid下使用CRNN-CTC 和 CRNN-Attention模型对图片中的文字内容进行识别。 ## 1. CRNN-CTC -本章的任务是识别含有单行汉语字符图片,首先采用卷积将图片转为`features map`, 然后使用`im2sequence op`将`features map`转为`sequence`,经过`双向GRU RNN`得到每个step的汉语字符的概率分布。训练过程选用的损失函数为CTC loss,最终的评估指标为`instance error rate`。 +本章的任务是识别含有单行汉语字符图片,首先采用卷积将图片转为特征图, 然后使用`im2sequence op`将特征图转为序列,通过`双向GRU`学习到序列特征。训练过程选用的损失函数为CTC(Connectionist Temporal Classification) loss,最终的评估指标为样本级别的错误率。 本路径下各个文件的作用如下: - **ctc_reader.py :** 下载、读取、处理数据。提供方法`train()` 和 `test()` 分别产生训练集和测试集的数据迭代器。 - **crnn_ctc_model.py :** 在该脚本中定义了训练网络、预测网络和evaluate网络。 - **ctc_train.py :** 用于模型的训练,可通过命令`python train.py --help` 获得使用方法。 -- **inference.py :** 加载训练好的模型文件,对新数据进行预测。可通过命令`python inference.py --help` 获得使用方法。 -- **eval.py :** 评估模型在指定数据集上的效果。可通过命令`python inference.py --help` 获得使用方法。 +- **infer.py :** 加载训练好的模型文件,对新数据进行预测。可通过命令`python infer.py --help` 获得使用方法。 +- **eval.py :** 评估模型在指定数据集上的效果。可通过命令`python infer.py --help` 获得使用方法。 - **utility.py :** 实现的一些通用方法,包括参数配置、tensor的构造等。 @@ -34,11 +34,11 @@ 图 1
-在训练集中,每张图片对应的label是由若干数字组成的sequence。 Sequence中的每个数字表示一个字符在字典中的index。 `图1` 对应的label如下所示: +在训练集中,每张图片对应的label是汉字在词典中的索引。 `图1` 对应的label如下所示: ``` 3835,8371,7191,2369,6876,4162,1938,168,1517,4590,3793 ``` -在上边这个label中,`3835` 表示字符‘两’的index,`4590` 表示中文字符逗号的index。 +在上边这个label中,`3835` 表示字符‘两’的索引,`4590` 表示中文字符逗号的索引。 #### 1.1.2 数据准备 @@ -122,7 +122,7 @@ env CUDA_VISIABLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True 执行`python ctc_train.py --help`可查看更多使用方式和参数详细说明。 -图2为使用默认参数和默认数据集训练的收敛曲线,其中横坐标轴为训练pass数,纵轴为在测试集上的sequence_error. +图2为使用默认参数和默认数据集训练的收敛曲线,其中横坐标轴为训练迭代次数,纵轴为样本级错误率。其中,蓝线为训练集上的样本错误率,红线为测试集上的样本错误率。在45轮迭代训练中,测试集上最低错误率为第60轮的21.11%.
@@ -150,7 +150,7 @@ env CUDA_VISIBLE_DEVICE=0 python eval.py \
从标准输入读取一张图片的路径,并对齐进行预测:
```
-env CUDA_VISIBLE_DEVICE=0 python inference.py \
+env CUDA_VISIBLE_DEVICE=0 python infer.py \
--model_path="models/model_00044_15000"
```
@@ -163,17 +163,17 @@ input_images_dir: None
input_images_list: None
model_path: /home/work/models/fluid/ocr_recognition/models/model_00052_15000
------------------------------------------------
-Init model from: /home/work/models/fluid/ocr_recognition/models/model_00052_15000.
-Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0060.jpg
+Init model from: ./models/model_00052_15000.
+Please input the path of image: ./test_images/00001_0060.jpg
result: [3298 2371 4233 6514 2378 3298 2363]
-Please input the path of image: /home/work/models/fluid/ocr_recognition/data/test_images/00001_0429.jpg
+Please input the path of image: ./test_images/00001_0429.jpg
result: [2067 2067 8187 8477 5027 7191 2431 1462]
```
从文件中批量读取图片路径,并对其进行预测:
```
-env CUDA_VISIBLE_DEVICE=0 python inference.py \
+env CUDA_VISIBLE_DEVICE=0 python infer.py \
--model_path="models/model_00044_15000" \
--input_images_list="data/test.list"
```
diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py
index df33100e36e25d871db25dd304e87053dfb77145..1e687d2aa53c0c43a7b491a61b60fd2432210c95 100644
--- a/fluid/ocr_recognition/crnn_ctc_model.py
+++ b/fluid/ocr_recognition/crnn_ctc_model.py
@@ -19,6 +19,7 @@ def conv_bn_pool(input,
param_attr=param if param_0 is None else param_0,
act=None, # LinearActivation
use_cudnn=True)
+ #tmp = fluid.layers.Print(tmp)
tmp = fluid.layers.batch_norm(
input=tmp,
act=act,
@@ -139,65 +140,30 @@ def encoder_net(images,
def ctc_train_net(images, label, args, num_classes):
- regularizer = fluid.regularizer.L2Decay(args.l2)
- gradient_clip = None
- if args.parallel:
- places = fluid.layers.get_places()
- pd = fluid.layers.ParallelDo(places, use_nccl=True)
- with pd.do():
- images_ = pd.read_input(images)
- label_ = pd.read_input(label)
-
- fc_out = encoder_net(
- images_,
- num_classes,
- regularizer=regularizer,
- gradient_clip=gradient_clip)
-
- cost = fluid.layers.warpctc(
- input=fc_out,
- label=label_,
- blank=num_classes,
- norm_by_times=True)
- sum_cost = fluid.layers.reduce_sum(cost)
-
- decoded_out = fluid.layers.ctc_greedy_decoder(
- input=fc_out, blank=num_classes)
-
- pd.write_output(sum_cost)
- pd.write_output(decoded_out)
-
- sum_cost, decoded_out = pd()
- sum_cost = fluid.layers.reduce_sum(sum_cost)
-
- else:
- fc_out = encoder_net(
- images,
- num_classes,
- regularizer=regularizer,
- gradient_clip=gradient_clip)
-
- cost = fluid.layers.warpctc(
- input=fc_out, label=label, blank=num_classes, norm_by_times=True)
- sum_cost = fluid.layers.reduce_sum(cost)
- decoded_out = fluid.layers.ctc_greedy_decoder(
- input=fc_out, blank=num_classes)
+ L2_RATE = 0.0004
+ LR = 1.0e-3
+ MOMENTUM = 0.9
+ regularizer = fluid.regularizer.L2Decay(L2_RATE)
+ fc_out = encoder_net(images, num_classes, regularizer=regularizer)
+ cost = fluid.layers.warpctc(
+ input=fc_out, label=label, blank=num_classes, norm_by_times=True)
+ sum_cost = fluid.layers.reduce_sum(cost)
+ decoded_out = fluid.layers.ctc_greedy_decoder(
+ input=fc_out, blank=num_classes)
casted_label = fluid.layers.cast(x=label, dtype='int64')
error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label)
-
inference_program = fluid.default_main_program().clone(for_test=True)
-
- optimizer = fluid.optimizer.Momentum(
- learning_rate=args.learning_rate, momentum=args.momentum)
+ optimizer = fluid.optimizer.Momentum(learning_rate=LR, momentum=MOMENTUM)
_, params_grads = optimizer.minimize(sum_cost)
- model_average = fluid.optimizer.ModelAverage(
- args.average_window,
- params_grads,
- min_average_window=args.min_average_window,
- max_average_window=args.max_average_window)
-
+ model_average = None
+ if args.average_window > 0:
+ model_average = fluid.optimizer.ModelAverage(
+ args.average_window,
+ params_grads,
+ min_average_window=args.min_average_window,
+ max_average_window=args.max_average_window)
return sum_cost, error_evaluator, inference_program, model_average
diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py
index 35db803506179d162226ae553fa25bfd4323d567..0351ba905b89139f0794d36ddea84c33ecc3722a 100644
--- a/fluid/ocr_recognition/ctc_train.py
+++ b/fluid/ocr_recognition/ctc_train.py
@@ -8,6 +8,7 @@ import functools
import sys
import time
import os
+import numpy as np
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
@@ -19,32 +20,23 @@ add_arg('save_model_period', int, 15000, "Save model period. '-1' means n
add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
add_arg('save_model_dir', str, "./models", "The directory the model to be saved to.")
add_arg('init_model', str, None, "The init model file of directory.")
-add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
-add_arg('l2', float, 0.0004, "L2 regularizer.")
-add_arg('momentum', float, 0.9, "Momentum.")
-add_arg('rnn_hidden_size', int, 200, "Hidden size of rnn layers.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('min_average_window',int, 10000, "Min average window.")
add_arg('max_average_window',int, 15625, "Max average window. It is proposed to be set as the number of minibatch in a pass.")
add_arg('average_window', float, 0.15, "Average window.")
add_arg('parallel', bool, False, "Whether use parallel training.")
-add_arg('train_images', str, None, "The directory of training images."
- "None means using the default training images of reader.")
-add_arg('train_list', str, None, "The list file of training images."
- "None means using the default train_list file of reader.")
-add_arg('test_images', str, None, "The directory of training images."
- "None means using the default test images of reader.")
-add_arg('test_list', str, None, "The list file of training images."
- "None means using the default test_list file of reader.")
-add_arg('num_classes', int, None, "The number of classes."
- "None means using the default num_classes from reader.")
# yapf: enable
def train(args, data_reader=ctc_reader):
"""OCR CTC training"""
+ num_classes = None
+ train_images = None
+ train_list = None
+ test_images = None
+ test_list = None
num_classes = data_reader.num_classes(
- ) if args.num_classes is None else args.num_classes
+ ) if num_classes is None else num_classes
data_shape = data_reader.data_shape()
# define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
@@ -56,10 +48,10 @@ def train(args, data_reader=ctc_reader):
# data reader
train_reader = data_reader.train(
args.batch_size,
- train_images_dir=args.train_images,
- train_list_file=args.train_list)
+ train_images_dir=train_images,
+ train_list_file=train_list)
test_reader = data_reader.test(
- test_images_dir=args.test_images, test_list_file=args.test_list)
+ test_images_dir=test_images, test_list_file=test_list)
# prepare environment
place = fluid.CPUPlace()
@@ -78,45 +70,72 @@ def train(args, data_reader=ctc_reader):
fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
print "Init model from: %s." % args.init_model
- for pass_id in range(args.pass_num):
+ train_exe = exe
+ if args.parallel:
+ train_exe = fluid.ParallelExecutor(
+ use_cuda=True, loss_name=sum_cost.name)
+
+ fetch_vars = [sum_cost] + error_evaluator.metrics
+
+ def train_one_batch(data):
+ var_names = [var.name for var in fetch_vars]
+ if args.parallel:
+ results = train_exe.run(var_names,
+ feed_dict=get_feeder_data(data, place))
+ results = [np.array(result).sum() for result in results]
+ else:
+ results = exe.run(feed=get_feeder_data(data, place),
+ fetch_list=fetch_vars)
+ results = [result[0] for result in results]
+ return results
+
+ def test(pass_id, batch_id):
error_evaluator.reset(exe)
+ for data in test_reader():
+ exe.run(inference_program, feed=get_feeder_data(data, place))
+ _, test_seq_error = error_evaluator.eval(exe)
+ print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
+ time.time(), pass_id, batch_id, str(test_seq_error[0]))
+
+ def save_model(args, exe, pass_id, batch_id):
+ filename = "model_%05d_%d" % (pass_id, batch_id)
+ fluid.io.save_params(
+ exe, dirname=args.save_model_dir, filename=filename)
+ print "Saved model to: %s/%s." % (args.save_model_dir, filename)
+
+ error_evaluator.reset(exe)
+ for pass_id in range(args.pass_num):
batch_id = 1
total_loss = 0.0
total_seq_error = 0.0
# train a pass
for data in train_reader():
- batch_loss, _, batch_seq_error = exe.run(
- fluid.default_main_program(),
- feed=get_feeder_data(data, place),
- fetch_list=[sum_cost] + error_evaluator.metrics)
- total_loss += batch_loss[0]
- total_seq_error += batch_seq_error[0]
+ results = train_one_batch(data)
+ total_loss += results[0]
+ total_seq_error += results[2]
# training log
if batch_id % args.log_period == 0:
- print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
+ print "\nTime: %s; Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq err: %s" % (
time.time(), pass_id, batch_id,
total_loss / (batch_id * args.batch_size),
total_seq_error / (batch_id * args.batch_size))
sys.stdout.flush()
+
# evaluate
if batch_id % args.eval_period == 0:
- with model_average.apply(exe):
- error_evaluator.reset(exe)
- for data in test_reader():
- exe.run(inference_program,
- feed=get_feeder_data(data, place))
- _, test_seq_error = error_evaluator.eval(exe)
-
- print "\nTime: %s; Pass[%d]-batch[%d]; Test seq error: %s.\n" % (
- time.time(), pass_id, batch_id, str(test_seq_error[0]))
+ if model_average:
+ with model_average.apply(exe):
+ test(pass_id, batch_id)
+ else:
+                test(pass_id, batch_id)
+
# save model
if batch_id % args.save_model_period == 0:
- with model_average.apply(exe):
- filename = "model_%05d_%d" % (pass_id, batch_id)
- fluid.io.save_params(
- exe, dirname=args.save_model_dir, filename=filename)
- print "Saved model to: %s/%s." % (args.save_model_dir,
- filename)
+ if model_average:
+ with model_average.apply(exe):
+ save_model(args, exe, pass_id, batch_id)
+ else:
+ save_model(args, exe, pass_id, batch_id)
batch_id += 1
diff --git a/fluid/ocr_recognition/images/train.jpg b/fluid/ocr_recognition/images/train.jpg
index 3d691f1cd6b44c99c1b89286573daf1abd6dcbfa..ec86fb1bf828699b3b63926accad0e943f25feeb 100644
Binary files a/fluid/ocr_recognition/images/train.jpg and b/fluid/ocr_recognition/images/train.jpg differ
diff --git a/fluid/ocr_recognition/inference.py b/fluid/ocr_recognition/infer.py
similarity index 75%
rename from fluid/ocr_recognition/inference.py
rename to fluid/ocr_recognition/infer.py
index 04175bb15d7834b76818b330763054e0a519e508..080e3f5f84efbb73e3c2381e809222fd2a90c416 100644
--- a/fluid/ocr_recognition/inference.py
+++ b/fluid/ocr_recognition/infer.py
@@ -14,6 +14,7 @@ add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('model_path', str, None, "The model path to be used for inference.")
add_arg('input_images_dir', str, None, "The directory of images.")
add_arg('input_images_list', str, None, "The list file of images.")
+add_arg('dict', str, None, "The dictionary. The result of inference will be index sequence if the dictionary was None.")
add_arg('use_gpu', bool, True, "Whether use GPU to infer.")
# yapf: enable
@@ -31,12 +32,21 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
infer_list_file=args.input_images_list)
# prepare environment
place = fluid.CPUPlace()
- if use_gpu:
+ if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
+ # load dictionary
+ dict_map = None
+ if args.dict is not None and os.path.isfile(args.dict):
+ dict_map = {}
+ with open(args.dict) as dict_file:
+ for i, word in enumerate(dict_file):
+ dict_map[i] = word.strip()
+ print "Loaded dict from %s" % args.dict
+
# load init model
model_dir = args.model_path
model_file_name = None
@@ -52,7 +62,11 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
data, place, need_label=False),
fetch_list=[sequence],
return_numpy=False)
- print "result: %s" % (np.array(result[0]).flatten(), )
+ indexes = np.array(result[0]).flatten()
+ if dict_map is not None:
+ print "result: %s" % ([dict_map[index] for index in indexes], )
+ else:
+ print "result: %s" % (indexes, )
def main():