提交 6552fcf2 编写于 作者: R root

Merge branch 'develop' of https://github.com/PaddlePaddle/models into ce_image_classification2

......@@ -165,10 +165,10 @@ python widerface_eval.py --infer=True --confs_threshold=0.15
```
下图可视化了模型的预测结果:
<p align="center">
<img src="images/0_Parade_marchingband_1_356.jpg" height=300 width=400 hspace='10'/>
<img src="images/28_Sports_Fan_Sports_Fan_28_770.jpg" height=300 width=400 hspace='10'/>
<img src="images/4_Dancing_Dancing_4_194.jpg" height=300 width=400 hspace='10'/>
<img src="images/2_Demonstration_Demonstration_Or_Protest_2_58.jpg" height=300 width=400 hspace='10'/> <br />
<img src="images/0_Parade_marchingband_1_356.jpg" height=400 width=400 hspace='10'/>
<img src="images/28_Sports_Fan_Sports_Fan_28_770.jpg" height=400 width=400 hspace='10'/>
<img src="images/4_Dancing_Dancing_4_194.jpg" height=400 width=400 hspace='10'/>
<img src="images/2_Demonstration_Demonstration_Or_Protest_2_58.jpg" height=400 width=400 hspace='10'/> <br />
Pyramidbox 预测可视化
</p>
......
......@@ -2,6 +2,7 @@
# This file is only used for continuous evaluation.
export ce_mode=1
rm -rf *_factor.txt
python train.py --use_gpu=True 1> log
python train.py --use_gpu=True --random_mirror=False --random_scaling=False 1> log
cat log | python _ce.py
......@@ -7,8 +7,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
# NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.05, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
tracking_kpis = [
train_cost_kpi,
......
"""Reader for Cityscape dataset.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cv2
import numpy as np
......@@ -173,8 +176,8 @@ class DataGenerater:
"""
Scale label according to factor.
"""
h = label.shape[0] / factor
w = label.shape[1] / factor
h = label.shape[0] // factor
w = label.shape[1] // factor
return cv2.resize(
label, (h, w), interpolation=cv2.INTER_NEAREST)[:, :, np.newaxis]
......
......@@ -64,7 +64,7 @@ def eval(args):
exe.run(fluid.default_startup_program())
assert os.path.exists(args.model_path)
fluid.io.load_params(exe, args.model_path)
print "loaded model from: %s" % args.model_path
print("loaded model from: %s" % args.model_path)
sys.stdout.flush()
fetch_vars = [iou, out_w, out_r]
......@@ -80,11 +80,10 @@ def eval(args):
fetch_list=fetch_vars)
out_wrong += result[1]
out_right += result[2]
print "count: %s; current iou: %.3f;\r" % (count, result[0]),
sys.stdout.flush()
iou = cal_mean_iou(out_wrong, out_right)
print "\nmean iou: %.3f" % iou
print "kpis test_acc %f" % iou
print("\nmean iou: %.3f" % iou)
print("kpis test_acc %f" % iou)
def main():
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
import numpy as np
import sys
......@@ -20,8 +23,8 @@ def conv(input,
if padding == "SAME":
padding_h = max(k_h - s_h, 0)
padding_w = max(k_w - s_w, 0)
padding_top = padding_h / 2
padding_left = padding_w / 2
padding_top = padding_h // 2
padding_left = padding_w // 2
padding_bottom = padding_h - padding_top
padding_right = padding_w - padding_left
padding = [
......@@ -57,8 +60,8 @@ def atrous_conv(input,
if padding == "SAME":
padding_h = max(k_h - s_h, 0)
padding_w = max(k_w - s_w, 0)
padding_top = padding_h / 2
padding_left = padding_w / 2
padding_top = padding_h // 2
padding_left = padding_w // 2
padding_bottom = padding_h - padding_top
padding_right = padding_w - padding_left
padding = [
......@@ -141,15 +144,15 @@ def dilation_convs(input):
def pyramis_pooling(input, input_shape):
shape = np.ceil(input_shape / 32).astype("int32")
shape = np.ceil(input_shape // 32).astype("int32")
h, w = shape
pool1 = avg_pool(input, h, w, h, w)
pool1_interp = interp(pool1, shape)
pool2 = avg_pool(input, h / 2, w / 2, h / 2, w / 2)
pool2 = avg_pool(input, h // 2, w // 2, h // 2, w // 2)
pool2_interp = interp(pool2, shape)
pool3 = avg_pool(input, h / 3, w / 3, h / 3, w / 3)
pool3 = avg_pool(input, h // 3, w // 3, h // 3, w // 3)
pool3_interp = interp(pool3, shape)
pool4 = avg_pool(input, h / 4, w / 4, h / 4, w / 4)
pool4 = avg_pool(input, h // 4, w // 4, h // 4, w // 4)
pool4_interp = interp(pool4, shape)
conv5_3_sum = input + pool4_interp + pool3_interp + pool2_interp + pool1_interp
return conv5_3_sum
......@@ -172,14 +175,14 @@ def shared_convs(image):
def res_block(input, filter_num, padding=0, dilation=None, name=None):
tmp = conv(input, 1, 1, filter_num / 4, 1, 1, name=name + "_1_1_reduce")
tmp = conv(input, 1, 1, filter_num // 4, 1, 1, name=name + "_1_1_reduce")
tmp = bn(tmp, relu=True)
tmp = zero_padding(tmp, padding=padding)
if dilation is None:
tmp = conv(tmp, 3, 3, filter_num / 4, 1, 1, name=name + "_3_3")
tmp = conv(tmp, 3, 3, filter_num // 4, 1, 1, name=name + "_3_3")
else:
tmp = atrous_conv(
tmp, 3, 3, filter_num / 4, dilation, name=name + "_3_3")
tmp, 3, 3, filter_num // 4, dilation, name=name + "_3_3")
tmp = bn(tmp, relu=True)
tmp = conv(tmp, 1, 1, filter_num, 1, 1, name=name + "_1_1_increase")
tmp = bn(tmp, relu=False)
......@@ -195,7 +198,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
proj_bn = bn(proj, relu=False)
tmp = conv(
input, 1, 1, filter_num / 4, stride, stride, name=name + "_1_1_reduce")
input, 1, 1, filter_num // 4, stride, stride, name=name + "_1_1_reduce")
tmp = bn(tmp, relu=True)
tmp = zero_padding(tmp, padding=padding)
......@@ -208,7 +211,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
tmp,
3,
3,
filter_num / 4,
filter_num // 4,
1,
1,
padding=padding,
......@@ -218,7 +221,7 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
tmp,
3,
3,
filter_num / 4,
filter_num // 4,
dilation,
padding=padding,
name=name + "_3_3")
......@@ -232,12 +235,12 @@ def proj_block(input, filter_num, padding=0, dilation=None, stride=1,
def sub_net_4(input, input_shape):
tmp = interp(input, out_shape=np.ceil(input_shape / 32))
tmp = interp(input, out_shape=np.ceil(input_shape // 32))
tmp = dilation_convs(tmp)
tmp = pyramis_pooling(tmp, input_shape)
tmp = conv(tmp, 1, 1, 256, 1, 1, name="conv5_4_k1")
tmp = bn(tmp, relu=True)
tmp = interp(tmp, input_shape / 16)
tmp = interp(tmp, input_shape // 16)
return tmp
......@@ -265,7 +268,7 @@ def CCF24(sub2_out, sub4_out, input_shape):
tmp = bn(tmp, relu=False)
tmp = tmp + sub2_out
tmp = fluid.layers.relu(tmp)
tmp = interp(tmp, input_shape / 8)
tmp = interp(tmp, input_shape // 8)
return tmp
......@@ -275,7 +278,7 @@ def CCF124(sub1_out, sub24_out, input_shape):
tmp = bn(tmp, relu=False)
tmp = tmp + sub1_out
tmp = fluid.layers.relu(tmp)
tmp = interp(tmp, input_shape / 4)
tmp = interp(tmp, input_shape // 4)
return tmp
......
"""Infer for ICNet model."""
from __future__ import print_function
import cityscape
import argparse
import functools
......@@ -101,7 +102,7 @@ def infer(args):
exe.run(fluid.default_startup_program())
assert os.path.exists(args.model_path)
fluid.io.load_params(exe, args.model_path)
print "loaded model from: %s" % args.model_path
print("loaded model from: %s" % args.model_path)
sys.stdout.flush()
if not os.path.isdir(args.out_path):
......
"""Trainer for ICNet model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from icnet import icnet
import cityscape
import argparse
import functools
import sys
import os
import time
import paddle.fluid as fluid
import numpy as np
......@@ -11,9 +15,8 @@ from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
SEED = 90
# random seed must set before configuring the network.
fluid.default_startup_program().random_seed = SEED
if 'ce_mode' in os.environ:
np.random.seed(10)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -87,10 +90,14 @@ def train(args):
if args.use_gpu:
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
if 'ce_mode' in os.environ:
fluid.default_startup_program().random_seed = 90
exe.run(fluid.default_startup_program())
if args.init_model is not None:
print "load model from: %s" % args.init_model
print("load model from: %s" % args.init_model)
sys.stdout.flush()
fluid.io.load_params(exe, args.init_model)
......@@ -107,7 +114,7 @@ def train(args):
for data in train_reader():
if iter_id > TOTAL_STEP:
end_time = time.time()
print "kpis train_duration %f" % (end_time - start_time)
print("kpis train_duration %f" % (end_time - start_time))
return
iter_id += 1
results = exe.run(
......@@ -119,10 +126,10 @@ def train(args):
sub124_loss += results[3]
# training log
if iter_id % LOG_PERIOD == 0:
print "Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % (
print("Iter[%d]; train loss: %.3f; sub4_loss: %.3f; sub24_loss: %.3f; sub124_loss: %.3f" % (
iter_id, t_loss / LOG_PERIOD, sub4_loss / LOG_PERIOD,
sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD)
print "kpis train_cost %f" % (t_loss / LOG_PERIOD)
sub24_loss / LOG_PERIOD, sub124_loss / LOG_PERIOD))
print("kpis train_cost %f" % (t_loss / LOG_PERIOD))
t_loss = 0.
sub4_loss = 0.
......@@ -133,7 +140,7 @@ def train(args):
if iter_id % CHECKPOINT_PERIOD == 0 and args.checkpoint_path is not None:
dir_name = args.checkpoint_path + "/" + str(iter_id)
fluid.io.save_persistables(exe, dirname=dir_name)
print "Saved checkpoint: %s" % (dir_name)
print("Saved checkpoint: %s" % (dir_name))
def main():
......
......@@ -19,6 +19,7 @@ from __future__ import print_function
import distutils.util
import numpy as np
from paddle.fluid import core
import six
def print_arguments(args):
......@@ -37,7 +38,7 @@ def print_arguments(args):
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
......
......@@ -5,6 +5,6 @@ cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
cudaid=${object_detection_cudaid:=0, 1, 2, 3}
cudaid=${object_detection_cudaid_m:=0, 1, 2, 3}
export CUDA_VISIBLE_DEVICES=$cudaid
python train.py --batch_size=64 --num_epochs=10 --total_images=6149 --enable_ce=True | python _ce.py
......@@ -45,7 +45,7 @@ def calc_diff(f1, f2):
sq_df = np.mean(df * df)
return max_df, sq_df
except Exception as e:
return -1.0, -1.0
return 1.0, 1.0
def compare(path1, path2, no_exception):
......
......@@ -245,10 +245,18 @@ class Network(object):
@layer
def prelu(self, input, channel_shared, name):
#fluid = import_fluid()
#output = fluid.layers.relu(input)
#return output
raise NotImplementedError('prelu not implemented')
fluid = import_fluid()
if channel_shared:
mode = 'all'
else:
mode = 'channel'
prefix = name + '_'
output = fluid.layers.prelu(
input,
mode=mode,
param_attr=fluid.ParamAttr(name=prefix + 'negslope'))
return output
def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding,
name):
......
......@@ -176,6 +176,7 @@ class DataReshaper(object):
del node.reshaped_data
return graph
class CropFuser(object):
'''
Crop is to return a scalar output Blob for an input Blob of arbitrary size.
......@@ -197,7 +198,8 @@ class CropFuser(object):
cls._traced_names[fname] = []
cls._traced_names[fname].append(tname)
def __init__(self, allowed_parent_types=[NodeKind.Input, NodeKind.DummyData]):
def __init__(self,
allowed_parent_types=[NodeKind.Input, NodeKind.DummyData]):
self.allowed_parent_types = allowed_parent_types
def __call__(self, graph):
......@@ -232,7 +234,11 @@ class CropFuser(object):
def merge(self, parent, child):
'''Merge the parent node into the child.'''
child.metadata['shape'] = [parent.output_shape.batch_size, parent.output_shape.channels, parent.output_shape.height, parent.output_shape.width]
child.metadata['shape'] = [
parent.output_shape.batch_size, parent.output_shape.channels,
parent.output_shape.height, parent.output_shape.width
]
class SubNodeFuser(object):
'''
......@@ -395,6 +401,8 @@ class ParameterNamer(object):
names = ('scale', )
if getattr(node.parameters, 'bias_term', False):
names = ('scale', 'offset')
elif node.kind == NodeKind.PReLU:
names = ('negslope', )
elif node.kind == "Normalize":
names = ('scale', )
else:
......
......@@ -18,6 +18,7 @@ from __future__ import print_function
import numpy as np
import os
import six
import paddle
import paddle.fluid as fluid
......@@ -102,7 +103,7 @@ def infer():
init_recursive_seq_lens, place)
# Feed dict for inference
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict = feeder.feed([[x[0]] for x in data])
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
......@@ -115,7 +116,7 @@ def infer():
lod_level_1 = fetch_outs[0].lod()[1]
token_array = np.array(fetch_outs[0])
result = []
for i in xrange(len(lod_level_1) - 1):
for i in six.moves.xrange(len(lod_level_1) - 1):
sentence_list = [
trg_dict[token]
for token in token_array[lod_level_1[i]:lod_level_1[i + 1]]
......@@ -125,7 +126,7 @@ def infer():
lod_level_0 = fetch_outs[0].lod()[0]
paragraphs = [
result[lod_level_0[i]:lod_level_0[i + 1]]
for i in xrange(len(lod_level_0) - 1)
for i in six.moves.xrange(len(lod_level_0) - 1)
]
for paragraph in paragraphs:
......
......@@ -7,7 +7,7 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_card1_kpi = CostKpi('train_cost_card1', 0.01, 0, actived=True)
train_cost_card1_kpi = CostKpi('train_cost_card1', 0.02, 0, actived=True)
test_cost_card1_kpi = CostKpi('test_cost_card1', 0.005, 0, actived=True)
train_duration_card1_kpi = DurationKpi(
'train_duration_card1', 0.06, 0, actived=True)
......
......@@ -14,6 +14,6 @@ cudaid=${object_detection_cudaid:=0}
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
cudaid=${object_detection_cudaid:=0,1,2,3}
cudaid=${object_detection_cudaid_m:=0,1,2,3}
export CUDA_VISIBLE_DEVICES=$cudaid
FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
......@@ -8,8 +8,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=False)
train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=False)
train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)
......
......@@ -22,6 +22,7 @@ import xml.etree.ElementTree
import os
import time
import copy
import six
class Settings(object):
......@@ -151,7 +152,7 @@ def preprocess(img, bbox_labels, mode, settings):
mirror = int(random.uniform(0, 2))
if mirror == 1:
img = img[:, ::-1, :]
for i in xrange(len(sampled_labels)):
for i in six.moves.xrange(len(sampled_labels)):
tmp = sampled_labels[i][1]
sampled_labels[i][1] = 1 - sampled_labels[i][3]
sampled_labels[i][3] = 1 - tmp
......
......@@ -65,7 +65,6 @@ def train(args,
name='gt_label', shape=[1], dtype='int32', lod_level=1)
difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=args.nms_threshold)
......@@ -88,16 +87,16 @@ def train(args,
if 'coco' in data_args.dataset:
# learning rate decay in 12, 19 pass, respectively
if '2014' in train_file_list:
epocs = 82783 / batch_size
epocs = 82783 // batch_size
boundaries = [epocs * 12, epocs * 19]
elif '2017' in train_file_list:
epocs = 118287 / batch_size
epocs = 118287 // batch_size
boundaries = [epocs * 12, epocs * 19]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25
]
elif 'pascalvoc' in data_args.dataset:
epocs = 19200 / batch_size
epocs = 19200 // batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
......@@ -126,6 +125,9 @@ def train(args,
train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size)
else:
import random
random.seed(0)
np.random.seed(0)
train_reader = paddle.batch(
reader.train(data_args, train_file_list, False), batch_size=batch_size)
test_reader = paddle.batch(
......@@ -137,7 +139,7 @@ def train(args,
model_path = os.path.join(model_save_dir, postfix)
if os.path.isdir(model_path):
shutil.rmtree(model_path)
print 'save models to %s' % (model_path)
print('save models to %s' % (model_path))
fluid.io.save_persistables(exe, model_path)
best_map = 0.
......@@ -166,8 +168,6 @@ def train(args,
start_time = time.time()
prev_start_time = start_time
every_pass_loss = []
iter = 0
pass_duration = 0.0
for batch_id, data in enumerate(train_reader()):
prev_start_time = start_time
start_time = time.time()
......@@ -193,15 +193,15 @@ def train(args,
total_time += end_time - start_time
train_avg_loss = np.mean(every_pass_loss)
if devices_num == 1:
print ("kpis train_cost %s" % train_avg_loss)
print ("kpis test_acc %s" % mean_map)
print ("kpis train_speed %s" % (total_time / epoch_idx))
print("kpis train_cost %s" % train_avg_loss)
print("kpis test_acc %s" % mean_map)
print("kpis train_speed %s" % (total_time / epoch_idx))
else:
print ("kpis train_cost_card%s %s" %
print("kpis train_cost_card%s %s" %
(devices_num, train_avg_loss))
print ("kpis test_acc_card%s %s" %
print("kpis test_acc_card%s %s" %
(devices_num, mean_map))
print ("kpis train_speed_card%s %f" %
print("kpis train_speed_card%s %f" %
(devices_num, total_time / epoch_idx))
......
......@@ -16,8 +16,10 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
import six
from paddle.fluid import core
......@@ -37,7 +39,7 @@ def print_arguments(args):
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
......
python ctc_train.py --batch_size=128 --total_step=10000 --eval_period=10000 --log_period=10000 --use_gpu=True
export ce_mode=1
rm *factor.txt
python ctc_train.py --batch_size=32 --total_step=30000 --eval_period=30000 --log_period=30000 --use_gpu=True 1> ./tmp.log
python train.py --batch_size=32 --total_step=1 --eval_period=1 --log_period=1 --use_gpu=True 1> ./tmp.log
cat tmp.log | python _ce.py
rm tmp.log
......@@ -5,8 +5,9 @@
## 代码结构
```
├── ctc_reader.py # 下载、读取、处理数据。
├── crnn_ctc_model.py # 定义了训练网络、预测网络和evaluate网络。
├── ctc_train.py # 用于模型的训练。
├── crnn_ctc_model.py # 定义了OCR CTC model的网络结构。
├── attention_model.py # 定义了OCR attention model的网络结构。
├── train.py # 用于模型的训练。
├── infer.py # 加载训练好的模型文件,对新数据进行预测。
├── eval.py # 评估模型在指定数据集上的效果。
└── utils.py # 定义通用的函数。
......@@ -15,9 +16,16 @@
## 简介
本章的任务是识别含有单行汉语字符图片,首先采用卷积将图片转为特征图, 然后使用`im2sequence op`将特征图转为序列,通过`双向GRU`学习到序列特征。训练过程选用的损失函数为CTC(Connectionist Temporal Classification) loss,最终的评估指标为样本级别的错误率
本章的任务是识别图片中单行英文字符,这里我们分别使用CTC model和attention model两种不同的模型来完成该任务
这两种模型有相同的编码部分,首先采用卷积将图片转为特征图, 然后使用`im2sequence op`将特征图转为序列,通过`双向GRU`学习到序列特征。
两种模型的解码部分和使用的损失函数区别如下:
- CTC model: 训练过程选用的损失函数为CTC(Connectionist Temporal Classification) loss, 预测阶段采用的是贪婪策略和CTC解码策略。
- Attention model: 训练过程选用的是带注意力机制的解码策略和交叉信息熵损失函数,预测阶段采用的是柱搜索策略。
训练以上两种模型的评估指标为样本级别的错误率。
## 数据
......@@ -124,15 +132,23 @@ env OMP_NUM_THREADS=<num_of_physical_cores> python ctc_train.py --use_gpu False
env CUDA_VISIABLE_DEVICES=0,1,2,3 python ctc_train.py --parallel=True
```
默认使用的是`CTC model`, 可以通过选项`--model="attention"`切换为`attention model`
执行`python ctc_train.py --help`可查看更多使用方式和参数详细说明。
图2为使用默认参数和默认数据集训练的收敛曲线,其中横坐标轴为训练迭代次数,纵轴为样本级错误率。其中,蓝线为训练集上的样本错误率,红线为测试集上的样本错误率。在60轮迭代训练中,测试集上最低错误率为第32轮的22.0%.
图2为使用默认参数在默认数据集上训练`CTC model`的收敛曲线,其中横坐标轴为训练迭代次数,纵轴为样本级错误率。其中,蓝线为训练集上的样本错误率,红线为测试集上的样本错误率。测试集上最低错误率为22.0%.
<p align="center">
<img src="images/train.jpg" width="620" hspace='10'/> <br/>
<img src="images/train.jpg" width="400" hspace='10'/> <br/>
<strong>图 2</strong>
</p>
图3为使用默认参数在默认数据集上训练`attention model`的收敛曲线,其中横坐标轴为训练迭代次数,纵轴为样本级错误率。其中,蓝线为训练集上的样本错误率,红线为测试集上的样本错误率。测试集上最低错误率为16.25%.
<p align="center">
<img src="images/train_attention.jpg" width="400" hspace='10'/> <br/>
<strong>图 3</strong>
</p>
## 测试
......
......@@ -7,7 +7,7 @@ from kpi import CostKpi, DurationKpi, AccKpi
# NOTE kpi.py should shared in models in some way!!!!
train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.05, 0, actived=True)
test_acc_kpi = AccKpi('test_acc', 0.005, 0, actived=True)
train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
# --- Attention-model hyper-parameters (module-level constants) ---
decoder_size = 128  # hidden size of the attention GRU decoder
word_vector_dim = 128  # dimensionality of target-token embeddings
max_length = 100  # NOTE(review): appears unused here — attention_infer shadows it with 20; confirm
sos = 0  # start-of-sequence token id (ignored by the edit-distance evaluator)
eos = 1  # end-of-sequence token id (also used by beam_search_decode)
gradient_clip = 10  # clip gradients by value via GradientClipByValue
LR = 1.0  # base learning rate for the Adadelta optimizer
beam_size = 2  # beam width used during attention_infer decoding
learning_rate_decay = None  # set to "piecewise_decay" to enable LR decay at step 50000
def conv_bn_pool(input,
                 group,
                 out_ch,
                 act="relu",
                 is_test=False,
                 pool=True,
                 use_cudnn=True):
    """Stack `group` 3x3 conv + batch-norm layers, optionally max-pooled.

    Args:
        input: input feature-map variable.
        group: number of conv + batch_norm repetitions.
        out_ch: per-repetition output channel counts (len >= group).
        act: activation applied by batch_norm (the conv itself is linear).
        is_test: build inference-mode batch_norm when True.
        pool: when True, append a 2x2 stride-2 max-pool with ceil_mode.
        use_cudnn: whether conv/pool use the cuDNN implementation.

    Returns:
        The resulting feature-map variable.
    """
    tmp = input
    # Fix: `range` instead of Python-2-only `xrange` — this module already
    # uses `from __future__ import ...` and must run under Python 3.
    for i in range(group):
        filter_size = 3
        # MSRA-style init: std = sqrt(2 / (k*k*fan_in)).
        conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
        conv_param = fluid.ParamAttr(
            initializer=fluid.initializer.Normal(0.0, conv_std))
        tmp = fluid.layers.conv2d(
            input=tmp,
            num_filters=out_ch[i],
            filter_size=3,
            padding=1,
            bias_attr=False,
            param_attr=conv_param,
            act=None,  # linear conv; activation is applied by batch_norm below
            use_cudnn=use_cudnn)
        tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
    if pool:  # idiomatic truth test (was `pool == True`)
        tmp = fluid.layers.pool2d(
            input=tmp,
            pool_size=2,
            pool_type='max',
            pool_stride=2,
            use_cudnn=use_cudnn,
            ceil_mode=True)
    return tmp
def ocr_convs(input, is_test=False, use_cudnn=True):
    """Convolutional backbone: four conv_bn_pool groups.

    The first three groups (16/32/64 channels, two convs each) downsample
    with max-pooling; the final 128-channel group keeps spatial size.
    """
    pooled_specs = ([16, 16], [32, 32], [64, 64])
    features = input
    for channels in pooled_specs:
        features = conv_bn_pool(
            features, 2, channels, is_test=is_test, use_cudnn=use_cudnn)
    # Last group: no pooling, so the sequence length is preserved.
    features = conv_bn_pool(
        features, 2, [128, 128], is_test=is_test, pool=False,
        use_cudnn=use_cudnn)
    return features
def encoder_net(images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
    """Encode images into sequence features with a CNN + bidirectional GRU.

    Args:
        images: input image batch variable.
        rnn_hidden_size: hidden size of each (forward / backward) GRU.
        is_test: build inference-mode batch_norm inside the conv stack.
        use_cudnn: whether convolutions use cuDNN.

    Returns:
        (gru_backward, encoded_vector, encoded_proj):
            gru_backward: backward-GRU outputs (its first step boots the decoder).
            encoded_vector: concat of forward and backward GRU outputs.
            encoded_proj: encoded_vector projected to `decoder_size`,
                pre-computed once for the attention mechanism.
    """
    conv_features = ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)
    # Slice the feature map column by column into a sequence:
    # window height = full feature height, width = 1.
    sliced_feature = fluid.layers.im2sequence(
        input=conv_features,
        stride=[1, 1],
        filter_size=[conv_features.shape[2], 1])
    para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
    # GRU bias learns at 2x the base learning rate.
    bias_attr = fluid.ParamAttr(
        initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
    # Two independent input projections (size 3*hidden, as dynamic_gru expects):
    # fc_1 feeds the forward GRU, fc_2 the backward GRU.
    fc_1 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=False)
    fc_2 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=False)
    gru_forward = fluid.layers.dynamic_gru(
        input=fc_1,
        size=rnn_hidden_size,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')
    gru_backward = fluid.layers.dynamic_gru(
        input=fc_2,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')
    encoded_vector = fluid.layers.concat(
        input=[gru_forward, gru_backward], axis=1)
    encoded_proj = fluid.layers.fc(input=encoded_vector,
                                   size=decoder_size,
                                   bias_attr=False)
    return gru_backward, encoded_vector, encoded_proj
def gru_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                               decoder_boot, decoder_size, num_classes):
    """Attention GRU decoder used for training/eval (teacher forcing).

    Args:
        target_embedding: embeddings of the shifted target sequence.
        encoder_vec: encoder outputs (forward + backward GRU concat).
        encoder_proj: encoder outputs pre-projected for attention scoring.
        decoder_boot: initial decoder hidden state.
        decoder_size: decoder hidden size.
        num_classes: number of character classes; the output layer has
            num_classes + 2 units to cover the sos/eos tokens.

    Returns:
        DynamicRNN output: a softmax distribution per decoding step.
    """

    def simple_attention(encoder_vec, encoder_proj, decoder_state):
        # Additive attention: score = fc(tanh(encoder_proj + W*state)).
        decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                             size=decoder_size,
                                             bias_attr=False)
        # Broadcast the projected state across all encoder time steps.
        decoder_state_expand = fluid.layers.sequence_expand(
            x=decoder_state_proj, y=encoder_proj)
        concated = encoder_proj + decoder_state_expand
        concated = fluid.layers.tanh(x=concated)
        # One scalar score per step, normalized within each sequence.
        attention_weights = fluid.layers.fc(input=concated,
                                            size=1,
                                            act=None,
                                            bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(
            input=attention_weights)
        weigths_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(
            x=encoder_vec, y=weigths_reshape, axis=0)
        # Context vector = attention-weighted sum of encoder outputs.
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context

    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(target_embedding)
        # Encoder outputs are constant across decoding steps.
        encoder_vec = rnn.static_input(encoder_vec)
        encoder_proj = rnn.static_input(encoder_proj)
        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
        context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
        # gru_unit expects inputs of size 3 * hidden.
        fc_1 = fluid.layers.fc(input=context,
                               size=decoder_size * 3,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=current_word,
                               size=decoder_size * 3,
                               bias_attr=False)
        decoder_inputs = fc_1 + fc_2
        h, _, _ = fluid.layers.gru_unit(
            input=decoder_inputs, hidden=hidden_mem, size=decoder_size * 3)
        rnn.update_memory(hidden_mem, h)
        out = fluid.layers.fc(input=h,
                              size=num_classes + 2,
                              bias_attr=True,
                              act='softmax')
        rnn.output(out)
    return rnn()
def attention_train_net(args, data_shape, num_classes):
    """Build the training graph for the attention OCR model.

    Args:
        args: parsed CLI args; reads average_window, min_average_window,
            max_average_window.
        data_shape: shape of the input image data layer.
        num_classes: number of character classes.

    Returns:
        (sum_cost, error_evaluator, inference_program, model_average)
    """
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    # label_in: decoder input (teacher forcing); label_out: expected output.
    label_in = fluid.layers.data(
        name='label_in', shape=[1], dtype='int32', lod_level=1)
    label_out = fluid.layers.data(
        name='label_out', shape=[1], dtype='int32', lod_level=1)
    gru_backward, encoded_vector, encoded_proj = encoder_net(images)
    # Boot the decoder from the first step of the backward GRU.
    backward_first = fluid.layers.sequence_pool(
        input=gru_backward, pool_type='first')
    decoder_boot = fluid.layers.fc(input=backward_first,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act="relu")
    label_in = fluid.layers.cast(x=label_in, dtype='int64')
    trg_embedding = fluid.layers.embedding(
        input=label_in,
        size=[num_classes + 2, word_vector_dim],
        dtype='float32')
    prediction = gru_decoder_with_attention(trg_embedding, encoded_vector,
                                            encoded_proj, decoder_boot,
                                            decoder_size, num_classes)
    fluid.clip.set_gradient_clip(fluid.clip.GradientClipByValue(gradient_clip))
    label_out = fluid.layers.cast(x=label_out, dtype='int64')
    _, maxid = fluid.layers.topk(input=prediction, k=1)
    error_evaluator = fluid.evaluator.EditDistance(
        input=maxid, label=label_out, ignored_tokens=[sos, eos])
    # Clone for inference BEFORE adding loss/optimizer ops so the test
    # program contains only the forward + evaluator graph.
    inference_program = fluid.default_main_program().clone(for_test=True)
    cost = fluid.layers.cross_entropy(input=prediction, label=label_out)
    sum_cost = fluid.layers.reduce_sum(cost)
    if learning_rate_decay == "piecewise_decay":
        learning_rate = fluid.layers.piecewise_decay([50000], [LR, LR * 0.01])
    else:
        learning_rate = LR
    optimizer = fluid.optimizer.Adadelta(
        learning_rate=learning_rate, epsilon=1.0e-6, rho=0.9)
    optimizer.minimize(sum_cost)
    # Optional parameter averaging, enabled via --average_window > 0.
    model_average = None
    if args.average_window > 0:
        model_average = fluid.optimizer.ModelAverage(
            args.average_window,
            min_average_window=args.min_average_window,
            max_average_window=args.max_average_window)
    return sum_cost, error_evaluator, inference_program, model_average
def simple_attention(encoder_vec, encoder_proj, decoder_state, decoder_size):
    """Additive attention over the encoder sequence.

    Projects the decoder state, adds it to the pre-projected encoder
    outputs, scores every time step, and returns the softmax-weighted
    sum of `encoder_vec` as the context vector.
    """
    # Project the decoder state into the attention space.
    state_proj = fluid.layers.fc(input=decoder_state,
                                 size=decoder_size,
                                 bias_attr=False)
    # Broadcast the projected state across the encoder time steps.
    state_expanded = fluid.layers.sequence_expand(
        x=state_proj, y=encoder_proj)
    mixed = fluid.layers.elementwise_add(encoder_proj, state_expanded)
    mixed = fluid.layers.tanh(x=mixed)
    # One scalar score per step, normalized within each sequence.
    step_scores = fluid.layers.fc(input=mixed,
                                  size=1,
                                  act=None,
                                  bias_attr=False)
    step_scores = fluid.layers.sequence_softmax(input=step_scores)
    flat_scores = fluid.layers.reshape(x=step_scores, shape=[-1])
    weighted = fluid.layers.elementwise_mul(
        x=encoder_vec, y=flat_scores, axis=0)
    # Context vector = sum of score-weighted encoder outputs.
    return fluid.layers.sequence_pool(input=weighted, pool_type='sum')
def attention_infer(images, num_classes, use_cudnn=True):
    """Build the beam-search inference graph for the attention model.

    Decodes step by step inside a While op, keeping per-step hidden
    states, candidate ids and scores in LoD tensor arrays, and finally
    backtracks the beams with beam_search_decode.

    Args:
        images: input image batch variable.
        num_classes: number of character classes (+2 for sos/eos).
        use_cudnn: whether encoder convolutions use cuDNN.

    Returns:
        The decoded id sequences (scores are computed but not returned).
    """
    max_length = 20  # shadows the module-level constant; hard cap on steps
    gru_backward, encoded_vector, encoded_proj = encoder_net(
        images, is_test=True, use_cudnn=use_cudnn)
    # Boot the decoder from the first step of the backward GRU.
    backward_first = fluid.layers.sequence_pool(
        input=gru_backward, pool_type='first')
    decoder_boot = fluid.layers.fc(input=backward_first,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act="relu")
    init_state = decoder_boot
    array_len = fluid.layers.fill_constant(
        shape=[1], dtype='int64', value=max_length)
    counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
    # fill the first element with init_state
    state_array = fluid.layers.create_array('float32')
    fluid.layers.array_write(init_state, array=state_array, i=counter)
    # ids, scores as memory
    ids_array = fluid.layers.create_array('int64')
    scores_array = fluid.layers.create_array('float32')
    # Initial beam ids/scores are fed by the caller (lod_level=2 beams).
    init_ids = fluid.layers.data(
        name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = fluid.layers.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)
    fluid.layers.array_write(init_ids, array=ids_array, i=counter)
    fluid.layers.array_write(init_scores, array=scores_array, i=counter)
    cond = fluid.layers.less_than(x=counter, y=array_len)
    while_op = fluid.layers.While(cond=cond)
    with while_op.block():
        # Read the previous step's beams and decoder state.
        pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
        pre_state = fluid.layers.array_read(array=state_array, i=counter)
        pre_score = fluid.layers.array_read(array=scores_array, i=counter)
        pre_ids_emb = fluid.layers.embedding(
            input=pre_ids,
            size=[num_classes + 2, word_vector_dim],
            dtype='float32')
        context = simple_attention(encoded_vector, encoded_proj, pre_state,
                                   decoder_size)
        # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
        pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
        context_expanded = fluid.layers.sequence_expand(context, pre_score)
        # gru_unit expects inputs of size 3 * hidden.
        fc_1 = fluid.layers.fc(input=context_expanded,
                               size=decoder_size * 3,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=pre_ids_emb,
                               size=decoder_size * 3,
                               bias_attr=False)
        decoder_inputs = fc_1 + fc_2
        current_state, _, _ = fluid.layers.gru_unit(
            input=decoder_inputs,
            hidden=pre_state_expanded,
            size=decoder_size * 3)
        current_state_with_lod = fluid.layers.lod_reset(
            x=current_state, y=pre_score)
        # use score to do beam search
        current_score = fluid.layers.fc(input=current_state_with_lod,
                                        size=num_classes + 2,
                                        bias_attr=True,
                                        act='softmax')
        topk_scores, topk_indices = fluid.layers.topk(
            current_score, k=beam_size)
        # calculate accumulated scores after topk to reduce computation cost
        accu_scores = fluid.layers.elementwise_add(
            x=fluid.layers.log(topk_scores),
            y=fluid.layers.reshape(
                pre_score, shape=[-1]),
            axis=0)
        selected_ids, selected_scores = fluid.layers.beam_search(
            pre_ids,
            pre_score,
            topk_indices,
            accu_scores,
            beam_size,
            1,  # end_id
            #level=0
        )
        fluid.layers.increment(x=counter, value=1, in_place=True)
        # update the memories
        fluid.layers.array_write(current_state, array=state_array, i=counter)
        fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
        fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
        # update the break condition: up to the max length or all candidates of
        # source sentences have ended.
        length_cond = fluid.layers.less_than(x=counter, y=array_len)
        finish_cond = fluid.layers.logical_not(
            fluid.layers.is_empty(x=selected_ids))
        fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
    ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                  beam_size, eos)
    return ids
def attention_eval(data_shape, num_classes):
    """Build the evaluation network for the attention OCR model.

    Args:
        data_shape: shape of the input image tensor, e.g. (C, H, W).
        num_classes: number of character classes; two extra embedding /
            output slots are reserved for the SOS and EOS tokens.

    Returns:
        (error_evaluator, sum_cost): an EditDistance evaluator over the
        decoder's argmax predictions (SOS/EOS ignored) and the summed
        cross-entropy cost against the shifted output labels.
    """
    pixel = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    ids_in = fluid.layers.data(
        name='label_in', shape=[1], dtype='int32', lod_level=1)
    ids_out = fluid.layers.data(
        name='label_out', shape=[1], dtype='int32', lod_level=1)
    # EditDistance / cross_entropy expect int64 label ids.
    ids_in = fluid.layers.cast(x=ids_in, dtype='int64')
    ids_out = fluid.layers.cast(x=ids_out, dtype='int64')

    gru_backward, encoded_vector, encoded_proj = encoder_net(
        pixel, is_test=True)
    # Bootstrap the decoder hidden state from the first step of the
    # backward encoder GRU.
    first_step = fluid.layers.sequence_pool(
        input=gru_backward, pool_type='first')
    decoder_boot = fluid.layers.fc(input=first_step,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act="relu")
    trg_embedding = fluid.layers.embedding(
        input=ids_in,
        size=[num_classes + 2, word_vector_dim],
        dtype='float32')
    prediction = gru_decoder_with_attention(trg_embedding, encoded_vector,
                                            encoded_proj, decoder_boot,
                                            decoder_size, num_classes)
    _, maxid = fluid.layers.topk(input=prediction, k=1)
    error_evaluator = fluid.evaluator.EditDistance(
        input=maxid, label=ids_out, ignored_tokens=[sos, eos])
    cost = fluid.layers.cross_entropy(input=prediction, label=ids_out)
    return error_evaluator, fluid.layers.reduce_sum(cost)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
from paddle.fluid.initializer import init_on_cpu
import math
import six
def conv_bn_pool(input,
......@@ -15,7 +19,7 @@ def conv_bn_pool(input,
pooling=True,
use_cudnn=False):
tmp = input
for i in xrange(group):
for i in six.moves.xrange(group):
tmp = fluid.layers.conv2d(
input=tmp,
num_filters=out_ch[i],
......@@ -166,13 +170,16 @@ def encoder_net(images,
return fc_out
def ctc_train_net(images, label, args, num_classes):
def ctc_train_net(args, data_shape, num_classes):
L2_RATE = 0.0004
LR = 1.0e-3
MOMENTUM = 0.9
learning_rate_decay = None
regularizer = fluid.regularizer.L2Decay(L2_RATE)
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int32', lod_level=1)
fc_out = encoder_net(
images,
num_classes,
......@@ -189,7 +196,7 @@ def ctc_train_net(images, label, args, num_classes):
inference_program = fluid.default_main_program().clone(for_test=True)
if learning_rate_decay == "piecewise_decay":
learning_rate = fluid.layers.piecewise_decay([
args.total_step / 4, args.total_step / 2, args.total_step * 3 / 4
args.total_step // 4, args.total_step // 2, args.total_step * 3 // 4
], [LR, LR * 0.1, LR * 0.01, LR * 0.001])
else:
learning_rate = LR
......@@ -211,7 +218,10 @@ def ctc_infer(images, num_classes, use_cudnn):
return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes)
def ctc_eval(images, label, num_classes, use_cudnn):
def ctc_eval(data_shape, num_classes, use_cudnn):
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int32', lod_level=1)
fc_out = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn)
decoded_out = fluid.layers.ctc_greedy_decoder(
input=fc_out, blank=num_classes)
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import cv2
import tarfile
import numpy as np
from PIL import Image
from os import path
from paddle.v2.image import load_image
import paddle.v2 as paddle
from paddle.dataset.image import load_image
import paddle
SOS = 0
EOS = 1
NUM_CLASSES = 95
DATA_SHAPE = [1, 48, 512]
......@@ -22,8 +27,8 @@ TEST_LIST_FILE_NAME = "test.list"
class DataGenerator(object):
def __init__(self):
pass
    def __init__(self, model="crnn_ctc"):
        """Create a data generator for the given network type.

        Args:
            model (str): which model the readers feed. "crnn_ctc" makes the
                readers yield (image, label) samples; any other value yields
                attention-style (image, [SOS] + label, label + [EOS]) samples.
        """
        self.model = model
def train_reader(self,
img_root_dir,
......@@ -65,11 +70,11 @@ class DataGenerator(object):
batchsize
) + "; i++) print $(4*i+1)\" \"$(4*i+2)\" \"$(4*i+3)\" \"$(4*i+4);}}' > " + to_file
os.system(cmd)
print "finish batch shuffle"
print("finish batch shuffle")
img_label_lines = open(to_file, 'r').readlines()
def reader():
sizes = len(img_label_lines) / batchsize
sizes = len(img_label_lines) // batchsize
if sizes == 0:
raise ValueError('Batch size is bigger than the dataset size.')
while True:
......@@ -89,7 +94,10 @@ class DataGenerator(object):
img = img.resize((sz[0], sz[1]))
img = np.array(img) - 127.5
img = img[np.newaxis, ...]
if self.model == "crnn_ctc":
result.append([img, label])
else:
result.append([img, [SOS] + label, label + [EOS]])
yield result
if not cycle:
break
......@@ -117,7 +125,10 @@ class DataGenerator(object):
'L')
img = np.array(img) - 127.5
img = img[np.newaxis, ...]
if self.model == "crnn_ctc":
yield img, label
else:
yield img, [SOS] + label, label + [EOS]
return reader
......@@ -185,8 +196,12 @@ def data_shape():
return DATA_SHAPE
def train(batch_size, train_images_dir=None, train_list_file=None, cycle=False):
generator = DataGenerator()
def train(batch_size,
train_images_dir=None,
train_list_file=None,
cycle=False,
model="crnn_ctc"):
generator = DataGenerator(model)
if train_images_dir is None:
data_dir = download_data()
train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME)
......@@ -199,8 +214,11 @@ def train(batch_size, train_images_dir=None, train_list_file=None, cycle=False):
train_images_dir, train_list_file, batch_size, cycle, shuffle=shuffle)
def test(batch_size=1, test_images_dir=None, test_list_file=None):
generator = DataGenerator()
def test(batch_size=1,
test_images_dir=None,
test_list_file=None,
model="crnn_ctc"):
generator = DataGenerator(model)
if test_images_dir is None:
data_dir = download_data()
test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME)
......@@ -213,8 +231,9 @@ def test(batch_size=1, test_images_dir=None, test_list_file=None):
def inference(batch_size=1,
infer_images_dir=None,
infer_list_file=None,
cycle=False):
generator = DataGenerator()
cycle=False,
model="crnn_ctc"):
generator = DataGenerator(model)
return paddle.batch(
generator.infer_reader(infer_images_dir, infer_list_file, cycle),
batch_size)
......
import paddle.v2 as paddle
import paddle.fluid as fluid
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_infer
from utility import add_arguments, print_arguments, to_lodtensor, get_ctc_feeder_data, get_attention_feeder_data
from attention_model import attention_eval
from crnn_ctc_model import ctc_eval
import ctc_reader
import data_reader
import argparse
import functools
import os
......@@ -11,27 +11,34 @@ import os
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model_path', str, None, "The model path to be used for inference.")
add_arg('model', str, "crnn_ctc", "Which type of network to be used. 'crnn_ctc' or 'attention'")
add_arg('model_path', str, "", "The model path to be used for inference.")
add_arg('input_images_dir', str, None, "The directory of images.")
add_arg('input_images_list', str, None, "The list file of images.")
add_arg('use_gpu', bool, True, "Whether use GPU to eval.")
# yapf: enable
def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
def evaluate(args):
"""OCR inference"""
if args.model == "crnn_ctc":
eval = ctc_eval
get_feeder_data = get_ctc_feeder_data
else:
eval = attention_eval
get_feeder_data = get_attention_feeder_data
num_classes = data_reader.num_classes()
data_shape = data_reader.data_shape()
# define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int32', lod_level=1)
evaluator, cost = eval(images, label, num_classes)
evaluator, cost = eval(data_shape, num_classes)
# data reader
test_reader = data_reader.test(
test_images_dir=args.input_images_dir,
test_list_file=args.input_images_list)
test_list_file=args.input_images_list,
model=args.model)
# prepare environment
place = fluid.CPUPlace()
......@@ -48,7 +55,7 @@ def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
model_dir = os.path.dirname(args.model_path)
model_file_name = os.path.basename(args.model_path)
fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
print "Init model from: %s." % args.model_path
print("Init model from: %s." % args.model_path)
evaluator.reset(exe)
count = 0
......@@ -56,14 +63,14 @@ def evaluate(args, eval=ctc_eval, data_reader=ctc_reader):
count += 1
exe.run(fluid.default_main_program(), feed=get_feeder_data(data, place))
avg_distance, avg_seq_error = evaluator.eval(exe)
print "Read %d samples; avg_distance: %s; avg_seq_error: %s" % (
count, avg_distance, avg_seq_error)
print("Read %d samples; avg_distance: %s; avg_seq_error: %s" % (
count, avg_distance, avg_seq_error))
def main():
    """Parse command-line arguments and run model evaluation.

    The merge left both the old ``evaluate(args, data_reader=ctc_reader)``
    call and the new ``evaluate(args)`` call in place; only the new
    signature exists, so exactly one call is kept.
    """
    args = parser.parse_args()
    print_arguments(args)
    evaluate(args)
if __name__ == "__main__":
......
from __future__ import print_function
import paddle.v2 as paddle
import paddle.fluid as fluid
from utility import add_arguments, print_arguments, to_lodtensor, get_ctc_feeder_data, get_attention_feeder_for_infer
import paddle.fluid.profiler as profiler
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_infer
from attention_model import attention_infer
import numpy as np
import ctc_reader
import data_reader
import argparse
import functools
import os
......@@ -13,6 +15,7 @@ import time
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('model', str, "crnn_ctc", "Which type of network to be used. 'crnn_ctc' or 'attention'")
add_arg('model_path', str, None, "The model path to be used for inference.")
add_arg('input_images_dir', str, None, "The directory of images.")
add_arg('input_images_list', str, None, "The list file of images.")
......@@ -25,20 +28,28 @@ add_arg('batch_size', int, 1, "The minibatch size.")
# yapf: enable
def inference(args, infer=ctc_infer, data_reader=ctc_reader):
def inference(args):
"""OCR inference"""
if args.model == "crnn_ctc":
infer = ctc_infer
get_feeder_data = get_ctc_feeder_data
else:
infer = attention_infer
get_feeder_data = get_attention_feeder_for_infer
eos = 1
sos = 0
num_classes = data_reader.num_classes()
data_shape = data_reader.data_shape()
# define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
sequence = infer(
images, num_classes, use_cudnn=True if args.use_gpu else False)
ids = infer(images, num_classes, use_cudnn=True if args.use_gpu else False)
# data reader
infer_reader = data_reader.inference(
batch_size=args.batch_size,
infer_images_dir=args.input_images_dir,
infer_list_file=args.input_images_list,
cycle=True if args.iterations > 0 else False)
cycle=True if args.iterations > 0 else False,
model=args.model)
# prepare environment
place = fluid.CPUPlace()
if args.use_gpu:
......@@ -54,7 +65,7 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
with open(args.dict) as dict_file:
for i, word in enumerate(dict_file):
dict_map[i] = word.strip()
print "Loaded dict from %s" % args.dict
print("Loaded dict from %s" % args.dict)
# load init model
model_dir = args.model_path
......@@ -63,11 +74,12 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
model_dir = os.path.dirname(args.model_path)
model_file_name = os.path.basename(args.model_path)
fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
print "Init model from: %s." % args.model_path
print("Init model from: %s." % args.model_path)
batch_times = []
iters = 0
for data in infer_reader():
feed_dict = get_feeder_data(data, place)
if args.iterations > 0 and iters == args.iterations + args.skip_batch_num:
break
if iters < args.skip_batch_num:
......@@ -77,26 +89,25 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
start = time.time()
result = exe.run(fluid.default_main_program(),
feed=get_feeder_data(
data, place, need_label=False),
fetch_list=[sequence],
feed=feed_dict,
fetch_list=[ids],
return_numpy=False)
indexes = prune(np.array(result[0]).flatten(), 0, 1)
batch_time = time.time() - start
fps = args.batch_size / batch_time
batch_times.append(batch_time)
indexes = np.array(result[0]).flatten()
if dict_map is not None:
print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % (
print("Iteration %d, latency: %.5f s, fps: %f, result: %s" % (
iters,
batch_time,
fps,
[dict_map[index] for index in indexes], )
[dict_map[index] for index in indexes], ))
else:
print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % (
print("Iteration %d, latency: %.5f s, fps: %f, result: %s" % (
iters,
batch_time,
fps,
indexes, )
indexes, ))
iters += 1
......@@ -114,18 +125,29 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader):
print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg, fps_pc99))
def prune(words, sos, eos):
    """Strip beam-search bookkeeping tokens from a flat id array.

    Keeps only the ids strictly after the first ``sos`` and strictly
    before the first ``eos``; either bound falls back to the array edge
    when the corresponding token is absent.
    """
    start = 0
    end = len(words)
    sos_hits = np.where(words == sos)[0]
    if sos_hits.size > 0:
        start = sos_hits[0] + 1
    eos_hits = np.where(words == eos)[0]
    if eos_hits.size > 0:
        end = eos_hits[0]
    return words[start:end]
def main():
    """Parse command-line arguments and run inference, optionally profiled.

    The merge left both the old ``inference(args, data_reader=ctc_reader)``
    calls and the new ``inference(args)`` calls in each branch; only the
    new signature exists, so a single call per branch is kept.
    """
    args = parser.parse_args()
    print_arguments(args)
    if args.profile:
        if args.use_gpu:
            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
                inference(args)
        else:
            with profiler.profiler("CPU", sorted_key='total') as cpuprof:
                inference(args)
    else:
        inference(args)
if __name__ == "__main__":
......
"""Trainer for OCR CTC model."""
"""Trainer for OCR CTC or attention model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from utility import add_arguments, print_arguments, to_lodtensor, get_ctc_feeder_data, get_attention_feeder_data
import paddle.fluid.profiler as profiler
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_train_net
import ctc_reader
from attention_model import attention_train_net
import data_reader
import argparse
import functools
import sys
......@@ -20,6 +24,7 @@ add_arg('log_period', int, 1000, "Log period.")
add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.")
add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.")
add_arg('save_model_dir', str, "./models", "The directory the model to be saved to.")
add_arg('model', str, "crnn_ctc", "Which type of network to be used. 'crnn_ctc' or 'attention'")
add_arg('init_model', str, None, "The init model file of directory.")
add_arg('use_gpu', bool, True, "Whether use GPU to train.")
add_arg('min_average_window',int, 10000, "Min average window.")
......@@ -32,8 +37,16 @@ add_arg('skip_test', bool, False, "Whether to skip test phase.")
# yapf: enable
def train(args, data_reader=ctc_reader):
"""OCR CTC training"""
def train(args):
"""OCR training"""
if args.model == "crnn_ctc":
train_net = ctc_train_net
get_feeder_data = get_ctc_feeder_data
else:
train_net = attention_train_net
get_feeder_data = get_attention_feeder_data
num_classes = None
train_images = None
train_list = None
......@@ -43,20 +56,18 @@ def train(args, data_reader=ctc_reader):
) if num_classes is None else num_classes
data_shape = data_reader.data_shape()
# define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(
name='label', shape=[1], dtype='int32', lod_level=1)
sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(
images, label, args, num_classes)
sum_cost, error_evaluator, inference_program, model_average = train_net(
args, data_shape, num_classes)
# data reader
train_reader = data_reader.train(
args.batch_size,
train_images_dir=train_images,
train_list_file=train_list,
cycle=args.total_step > 0)
cycle=args.total_step > 0,
model=args.model)
test_reader = data_reader.test(
test_images_dir=test_images, test_list_file=test_list)
test_images_dir=test_images, test_list_file=test_list, model=args.model)
# prepare environment
place = fluid.CPUPlace()
......@@ -77,7 +88,7 @@ def train(args, data_reader=ctc_reader):
model_dir = os.path.dirname(args.init_model)
model_file_name = os.path.basename(args.init_model)
fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
print "Init model from: %s." % args.init_model
print("Init model from: %s." % args.init_model)
train_exe = exe
error_evaluator.reset(exe)
......@@ -104,18 +115,18 @@ def train(args, data_reader=ctc_reader):
for data in test_reader():
exe.run(inference_program, feed=get_feeder_data(data, place))
_, test_seq_error = error_evaluator.eval(exe)
print "\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
time.time(), iter_num, str(test_seq_error[0]))
print("\nTime: %s; Iter[%d]; Test seq error: %s.\n" % (
time.time(), iter_num, str(test_seq_error[0])))
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
print "kpis test_acc %f" % (1 - test_seq_error[0])
print("kpis test_acc %f" % (1 - test_seq_error[0]))
def save_model(args, exe, iter_num):
filename = "model_%05d" % iter_num
fluid.io.save_params(
exe, dirname=args.save_model_dir, filename=filename)
print "Saved model to: %s/%s." % (args.save_model_dir, filename)
print("Saved model to: %s/%s." % (args.save_model_dir, filename))
iter_num = 0
stop = False
......@@ -144,18 +155,18 @@ def train(args, data_reader=ctc_reader):
iter_num += 1
# training log
if iter_num % args.log_period == 0:
print "\nTime: %s; Iter[%d]; Avg Warp-CTC loss: %.3f; Avg seq err: %.3f" % (
print("\nTime: %s; Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f" % (
time.time(), iter_num,
total_loss / (args.log_period * args.batch_size),
total_seq_error / (args.log_period * args.batch_size))
print "kpis train_cost %f" % (total_loss / (args.log_period *
args.batch_size))
print "kpis train_acc %f" % (
1 - total_seq_error / (args.log_period * args.batch_size))
total_seq_error / (args.log_period * args.batch_size)))
print("kpis train_cost %f" % (total_loss / (args.log_period *
args.batch_size)))
print("kpis train_acc %f" % (
1 - total_seq_error / (args.log_period * args.batch_size)))
total_loss = 0.0
total_seq_error = 0.0
# evaluate
# evaluate
if not args.skip_test and iter_num % args.eval_period == 0:
if model_average:
with model_average.apply(exe):
......@@ -171,7 +182,7 @@ def train(args, data_reader=ctc_reader):
else:
save_model(args, exe, iter_num)
end_time = time.time()
print "kpis train_duration %f" % (end_time - start_time)
print("kpis train_duration %f" % (end_time - start_time))
# Postprocess benchmark data
latencies = batch_times[args.skip_batch_num:]
latency_avg = np.average(latencies)
......@@ -195,12 +206,12 @@ def main():
if args.profile:
if args.use_gpu:
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
train(args, data_reader=ctc_reader)
train(args)
else:
with profiler.profiler("CPU", sorted_key='total') as cpuprof:
train(args, data_reader=ctc_reader)
train(args)
else:
train(args, data_reader=ctc_reader)
train(args)
if __name__ == "__main__":
......
......@@ -19,6 +19,8 @@ from __future__ import print_function
import distutils.util
import numpy as np
from paddle.fluid import core
import paddle.fluid as fluid
import six
def print_arguments(args):
......@@ -37,7 +39,7 @@ def print_arguments(args):
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
......@@ -77,14 +79,58 @@ def to_lodtensor(data, place):
return res
def get_feeder_data(data, place, need_label=True):
def get_ctc_feeder_data(data, place, need_label=True):
pixel_tensor = core.LoDTensor()
pixel_data = None
pixel_data = np.concatenate(
map(lambda x: x[0][np.newaxis, :], data), axis=0).astype("float32")
list(map(lambda x: x[0][np.newaxis, :], data)), axis=0).astype("float32")
pixel_tensor.set(pixel_data, place)
label_tensor = to_lodtensor(map(lambda x: x[1], data), place)
label_tensor = to_lodtensor(list(map(lambda x: x[1], data)), place)
if need_label:
return {"pixel": pixel_tensor, "label": label_tensor}
else:
return {"pixel": pixel_tensor}
def get_attention_feeder_data(data, place, need_label=True):
    """Convert a batch from the attention reader into a feed dict.

    Args:
        data: list of (image, label_in, label_out) samples, where
            label_in is [SOS] + label and label_out is label + [EOS].
        place: device place the tensors are created on.
        need_label: when False (pure inference), only the pixel tensor
            is built and fed.

    Returns:
        dict mapping network input names to LoDTensors.
    """
    pixel_tensor = core.LoDTensor()
    # Stack per-sample images into one (batch, C, H, W) float32 array.
    pixel_data = np.concatenate(
        [sample[0][np.newaxis, :] for sample in data],
        axis=0).astype("float32")
    pixel_tensor.set(pixel_data, place)
    if not need_label:
        # Skip building the label tensors entirely when they are not fed.
        return {"pixel": pixel_tensor}
    return {
        "pixel": pixel_tensor,
        "label_in": to_lodtensor([sample[1] for sample in data], place),
        "label_out": to_lodtensor([sample[2] for sample in data], place),
    }
def get_attention_feeder_for_infer(data, place):
    """Build the feed dict for attention-model beam-search inference.

    Besides the batched input images, the beam search needs per-sample
    initial ids (all SOS, i.e. 0) and initial scores (all 1.0) as LoD
    tensors with a two-level LoD of one candidate per source image.

    Args:
        data: list of samples whose first element is the image array.
        place: device place the tensors are created on.

    Returns:
        dict with "pixel", "init_ids" and "init_scores" entries.
    """
    batch_size = len(data)
    # Allocate the (batch, 1) seeds directly instead of building a flat
    # python list and reshaping it afterwards.
    init_ids_data = np.zeros((batch_size, 1), dtype='int64')
    init_scores_data = np.ones((batch_size, 1), dtype='float32')
    # Two identical LoD levels: one beam candidate per source sample.
    init_recursive_seq_lens = [[1] * batch_size, [1] * batch_size]
    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
                                       place)
    init_scores = fluid.create_lod_tensor(init_scores_data,
                                          init_recursive_seq_lens, place)
    pixel_tensor = core.LoDTensor()
    # Stack per-sample images into one (batch, C, H, W) float32 array.
    pixel_data = np.concatenate(
        [sample[0][np.newaxis, :] for sample in data],
        axis=0).astype("float32")
    pixel_tensor.set(pixel_data, place)
    return {
        "pixel": pixel_tensor,
        "init_ids": init_ids,
        "init_scores": init_scores
    }
......@@ -7,8 +7,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_acc_kpi = AccKpi('train_precision', 0.005, actived=True)
test_acc_kpi = CostKpi('test_precision', 0.005, actived=True)
train_acc_kpi = AccKpi('train_precision', 0.005, actived=False)
test_acc_kpi = CostKpi('test_precision', 0.005, actived=False)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
tracking_kpis = [
......
......@@ -8,7 +8,7 @@ from kpi import CostKpi, DurationKpi, AccKpi
#### NOTE kpi.py should shared in models in some way!!!!
train_acc_kpi = AccKpi('train_acc', 0.005, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.005, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.005, actived=False)
train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
tracking_kpis = [
......
......@@ -3,6 +3,7 @@ import contextlib
import paddle
import paddle.fluid as fluid
import numpy as np
import six
import sys
import time
import os
......@@ -46,8 +47,8 @@ def data2tensor(data, place):
"""
data2tensor
"""
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
input_seq = to_lodtensor([x[0] for x in data], place)
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
return {"words": input_seq, "label": y_data}
......@@ -56,8 +57,8 @@ def data2pred(data, place):
"""
data2tensor
"""
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
input_seq = to_lodtensor([x[0] for x in data], place)
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
return {"words": input_seq}
......@@ -79,7 +80,7 @@ def save_dict(word_dict, vocab):
Save dict into file
"""
with open(vocab, "w") as fout:
for k, v in word_dict.iteritems():
for k, v in six.iteritems(word_dict):
outstr = ("%s\t%s\n" % (k, v)).encode("gb18030")
fout.write(outstr)
......@@ -424,7 +425,7 @@ def start_train(train_reader,
start_exe.run(fluid.default_startup_program())
exe = fluid.ParallelExecutor(use_cuda, loss_name=cost.name)
for pass_id in xrange(pass_num):
for pass_id in six.moves.xrange(pass_num):
total_acc, total_cost, total_count, avg_cost, avg_acc = 0.0, 0.0, 0.0, 0.0, 0.0
for data in train_reader():
cost_val, acc_val = exe.run(feed=feeder.feed(data),
......
......@@ -3,6 +3,7 @@ import contextlib
import paddle
import paddle.fluid as fluid
import numpy as np
import six
import sys
import time
import os
......@@ -46,8 +47,8 @@ def data2tensor(data, place):
"""
data2tensor
"""
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
input_seq = to_lodtensor([x[0] for x in data], place)
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
return {"words": input_seq, "label": y_data}
......@@ -56,8 +57,8 @@ def data2pred(data, place):
"""
data2tensor
"""
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
input_seq = to_lodtensor([x[0] for x in data], place)
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
return {"words": input_seq}
......@@ -79,7 +80,7 @@ def save_dict(word_dict, vocab):
Save dict into file
"""
with open(vocab, "w") as fout:
for k, v in word_dict.iteritems():
for k, v in six.iteritems(word_dict):
outstr = ("%s\t%s\n" % (k, v)).encode("gb18030")
fout.write(outstr)
......@@ -422,7 +423,7 @@ def start_train(train_reader,
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program())
for pass_id in xrange(pass_num):
for pass_id in six.moves.xrange(pass_num):
data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
for data in train_reader():
avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
......
import os
import six
import sys
import time
import unittest
......@@ -58,7 +59,7 @@ def train(train_reader,
if "CE_MODE_X" in os.environ:
fluid.default_startup_program().random_seed = 110
exe.run(fluid.default_startup_program())
for pass_id in xrange(pass_num):
for pass_id in six.moves.xrange(pass_num):
pass_start = time.time()
data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
for data in train_reader():
......
......@@ -43,8 +43,8 @@ def data2tensor(data, place):
"""
data2tensor
"""
input_seq = to_lodtensor(map(lambda x: x[0], data), place)
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
input_seq = to_lodtensor([x[0] for x in data], place)
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape([-1, 1])
return {"words": input_seq, "label": y_data}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册