提交 3deecd96 编写于 作者: W wanghaoshuang

Refine CTC model

1. Move all network definitions to 'crnn_ctc_model.py'
2. Add initializer for some layers
3. Rename 'fluid/ocr' to 'fluid/ocr_recognition'
4. Remove copyright
5. Rename some functions
上级 744908f0
"""Trainer for OCR CTC model."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import numpy as np
import dummy_reader
import argparse
import functools
from paddle.v2.fluid import core
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Command-line options: (name, type, default, help).
# yapf: disable
add_arg('batch_size', int, 16, "Minibatch size.")
add_arg('pass_num', int, 16, "# of training epochs.")
add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
add_arg('l2', float, 0.0005, "L2 regularizer.")
add_arg('max_clip', float, 10.0, "Max clip threshold.")
add_arg('min_clip', float, -10.0, "Min clip threshold.")
add_arg('momentum', float, 0.9, "Momentum.")
add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
add_arg('device', int, -1, "Device id.'-1' means running on CPU"
        "while '0' means GPU-0.")
# yapf: enable
def _to_lodtensor(data, place):
    """Flatten a batch of variable-length label sequences into an int32 LoDTensor.

    `data` is a list of per-sample sequences; the level-of-detail (LoD) holds
    the cumulative offsets so the tensor can be split back per sample.
    """
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))
    flat = np.concatenate(data, axis=0).astype("int32")
    flat = flat.reshape([flat.shape[0], 1])
    tensor = core.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    return tensor
def _get_feeder_data(data, place):
    """Pack a minibatch of (image, label) pairs into the feed dict.

    Images are stacked along a new batch axis as float32; labels go through
    _to_lodtensor to become a variable-length int32 LoDTensor.
    """
    images = np.concatenate(
        [sample[0][np.newaxis, :] for sample in data], axis=0).astype("float32")
    pixel_tensor = core.LoDTensor()
    pixel_tensor.set(images, place)
    label_tensor = _to_lodtensor([sample[1] for sample in data], place)
    return {"pixel": pixel_tensor, "label": label_tensor}
def _ocr_conv(input, num, with_bn, param_attrs):
    """Four stacked img_conv_group blocks with 16/32/64/128 filters.

    Each block holds num / 4 conv3x3 layers (ReLU, optional batch-norm)
    followed by a 2x2 max pool. `num` must be a multiple of 4.
    """
    assert (num % 4 == 0)
    group_size = num / 4  # integer division under Python 2

    def _block(src, n_filters):
        return fluid.nets.img_conv_group(
            input=src,
            conv_num_filter=[n_filters] * group_size,
            pool_size=2,
            pool_stride=2,
            conv_padding=1,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=with_bn,
            pool_type='max',
            param_attr=param_attrs)

    out = input
    for n_filters in (16, 32, 64, 128):
        out = _block(out, n_filters)
    return out
def _ocr_ctc_net(images, num_classes, param_attrs, rnn_hidden_size=200):
    """CNN feature extractor + bidirectional GRU + projection to class scores.

    Returns per-timestep scores over num_classes + 1 labels; the extra
    index is reserved for the CTC blank by the caller.
    """
    conv_out = _ocr_conv(images, 8, True, param_attrs)
    # Collapse the feature-map height so every column becomes one RNN step.
    seq_feature = fluid.layers.im2sequence(
        input=conv_out,
        stride=[1, 1],
        filter_size=[conv_out.shape[2], 1])
    # dynamic_gru expects an input of 3 * hidden_size per direction.
    fc_fwd = fluid.layers.fc(input=seq_feature,
                             size=rnn_hidden_size * 3,
                             param_attr=param_attrs)
    fc_bwd = fluid.layers.fc(input=seq_feature,
                             size=rnn_hidden_size * 3,
                             param_attr=param_attrs)
    gru_fwd = fluid.layers.dynamic_gru(
        input=fc_fwd, size=rnn_hidden_size, param_attr=param_attrs)
    gru_bwd = fluid.layers.dynamic_gru(
        input=fc_bwd,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=param_attrs)
    return fluid.layers.fc(input=[gru_fwd, gru_bwd],
                           size=num_classes + 1,
                           param_attr=param_attrs)
def train(args, data_reader=dummy_reader):
    """OCR CTC training.

    Builds the conv + bi-GRU CTC network, trains for args.pass_num passes
    with momentum SGD, and reports edit distance ("word error") on the
    train and test readers after every pass.

    Args:
        args: parsed command-line namespace (batch_size, pass_num,
            learning_rate, l2, max_clip, min_clip, momentum,
            rnn_hidden_size, device).
        data_reader: module providing num_classes(), data_shape(),
            train() and test() reader creators.
    """
    num_classes = data_reader.num_classes()
    # define network
    # Shared attribute for all parameters: L2 decay plus value clipping.
    param_attrs = fluid.ParamAttr(
        regularizer=fluid.regularizer.L2Decay(args.l2),
        gradient_clip=fluid.clip.GradientClipByValue(args.max_clip, args.min_clip))
    data_shape = data_reader.data_shape()
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    # lod_level=1: labels are variable-length sequences.
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)
    fc_out = _ocr_ctc_net(images, num_classes, param_attrs, rnn_hidden_size=args.rnn_hidden_size)
    # define cost and optimizer
    # The network emits num_classes + 1 scores; index num_classes is the
    # CTC blank label.
    cost = fluid.layers.warpctc(
        input=fc_out,
        label=label,
        size=num_classes + 1,
        blank=num_classes,
        norm_by_times=True)
    avg_cost = fluid.layers.mean(x=cost)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=args.learning_rate, momentum=args.momentum)
    optimizer.minimize(avg_cost)
    # decoder and evaluator
    decoded_out = fluid.layers.ctc_greedy_decoder(
        input=fc_out, blank=num_classes)
    # EditDistance expects int64 labels.
    casted_label = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded_out, label=casted_label)
    # data reader
    train_reader = paddle.batch(data_reader.train(), batch_size=args.batch_size)
    test_reader = paddle.batch(data_reader.test(), batch_size=args.batch_size)
    # prepare environment: CPU by default, GPU when a device id is given.
    place = fluid.CPUPlace()
    if args.device >= 0:
        place = fluid.CUDAPlace(args.device)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    # Inference program reuses the graph up to the evaluator (no backward).
    inference_program = fluid.io.get_inference_program(error_evaluator)
    for pass_id in range(args.pass_num):
        error_evaluator.reset(exe)
        batch_id = 0
        # train a pass
        for data in train_reader():
            loss, batch_edit_distance = exe.run(
                fluid.default_main_program(),
                feed=_get_feeder_data(data, place),
                fetch_list=[avg_cost] + error_evaluator.metrics)
            print "Pass[%d]-batch[%d]; Loss: %s; Word error: %s." % (
                pass_id, batch_id, loss[0], batch_edit_distance[0])
            batch_id += 1
        train_edit_distance = error_evaluator.eval(exe)
        print "End pass[%d]; Train word error: %s." % (
            pass_id, str(train_edit_distance[0]))
        # evaluate model on test data
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=_get_feeder_data(data, place))
        test_edit_distance = error_evaluator.eval(exe)
        print "End pass[%d]; Test word error: %s." % (
            pass_id, str(test_edit_distance[0]))
def main():
    """Parse command-line arguments, echo them, and launch training."""
    args = parser.parse_args()
    print_arguments(args)
    train(args)
if __name__ == "__main__":
main()
import paddle.v2.fluid as fluid
def conv_bn_pool(input,
                 group,
                 out_ch,
                 act="relu",
                 param=None,
                 bias=None,
                 param_0=None):
    """`group` conv3x3 + batch-norm layers followed by one 2x2 max pool.

    out_ch[i] is the filter count of the i-th conv. When param_0 is given
    it replaces `param` as the conv weight attribute for every conv in
    this stack; batch-norm always uses `param`/`bias`.
    """
    # The weight attribute is the same for all convs in the stack.
    conv_w = param if param_0 is None else param_0
    out = input
    for idx in xrange(group):
        # Convolution is linear (act=None); the activation is applied by
        # the batch-norm layer that follows.
        out = fluid.layers.conv2d(
            input=out,
            num_filters=out_ch[idx],
            filter_size=3,
            padding=1,
            param_attr=conv_w,
            act=None,
            use_cudnn=True)
        out = fluid.layers.batch_norm(
            input=out, act=act, param_attr=param, bias_attr=bias)
    out = fluid.layers.pool2d(
        input=out, pool_size=2, pool_type='max', pool_stride=2, use_cudnn=True)
    return out
def ocr_convs(input, num, with_bn, regularizer=None, gradient_clip=None):
    """Four conv_bn_pool stacks (16 -> 32 -> 64 -> 128 channels).

    `num` must be a multiple of 4. NOTE(review): `with_bn` is accepted for
    interface compatibility but is unused here — conv_bn_pool always
    applies batch-norm.
    """
    assert (num % 4 == 0)

    def _normal(std):
        # All attributes share the regularizer/clip; only init std differs.
        return fluid.ParamAttr(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=fluid.initializer.Normal(0.0, std))

    b = _normal(0.0)
    w0 = _normal(0.0005)
    w1 = _normal(0.01)
    # The first stack uses the smaller-std weight init (w0) via param_0.
    out = conv_bn_pool(input, 2, [16, 16], param=w1, bias=b, param_0=w0)
    for channels in ([32, 32], [64, 64], [128, 128]):
        out = conv_bn_pool(out, 2, channels, param=w1, bias=b)
    return out
def encoder_net(images,
                num_classes,
                rnn_hidden_size=200,
                regularizer=None,
                gradient_clip=None):
    """CRNN encoder: conv stack -> im2sequence -> bi-GRU -> class scores.

    Returns unnormalized per-timestep scores over num_classes + 1 labels
    (the extra label is the CTC blank used by the caller).
    """
    def _normal(std, **extra):
        # Shared regularizer/clip for every attribute; init std varies.
        return fluid.ParamAttr(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=fluid.initializer.Normal(0.0, std),
            **extra)

    conv_features = ocr_convs(
        images, 8, True, regularizer=regularizer, gradient_clip=gradient_clip)
    # Collapse feature-map height so each column becomes one RNN step.
    sliced_feature = fluid.layers.im2sequence(
        input=conv_features,
        stride=[1, 1],
        filter_size=[conv_features.shape[2], 1])

    para_attr = _normal(0.02)
    # GRU biases train at twice the base learning rate.
    bias_attr = _normal(0.02, learning_rate=2.0)
    bias_attr_nobias = _normal(0.02)

    # dynamic_gru expects 3 * hidden_size inputs per direction.
    fc_1 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)
    fc_2 = fluid.layers.fc(input=sliced_feature,
                           size=rnn_hidden_size * 3,
                           param_attr=para_attr,
                           bias_attr=bias_attr_nobias)
    gru_forward = fluid.layers.dynamic_gru(
        input=fc_1,
        size=rnn_hidden_size,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')
    gru_backward = fluid.layers.dynamic_gru(
        input=fc_2,
        size=rnn_hidden_size,
        is_reverse=True,
        param_attr=para_attr,
        bias_attr=bias_attr,
        candidate_activation='relu')

    return fluid.layers.fc(input=[gru_forward, gru_backward],
                           size=num_classes + 1,
                           param_attr=_normal(0.02),
                           bias_attr=_normal(0.0))
def ctc_train_net(images, label, args, num_classes):
    """Build the CTC training graph.

    Wires encoder_net into a warpctc loss, attaches a Momentum optimizer,
    and builds a greedy-decode + edit-distance evaluator.

    Args:
        images: image input data layer.
        label: int32 sequence-label data layer (lod_level 1).
        args: namespace providing l2, learning_rate and momentum.
        num_classes: number of real character classes (blank excluded).

    Returns:
        Tuple of (avg_cost, error_evaluator).
    """
    regularizer = fluid.regularizer.L2Decay(args.l2)
    # NOTE: gradient clipping is deliberately disabled here; the previous
    # setting was fluid.clip.GradientClipByValue(args.max_clip, args.min_clip).
    gradient_clip = None
    fc_out = encoder_net(
        images,
        num_classes,
        regularizer=regularizer,
        gradient_clip=gradient_clip)
    # Index num_classes is reserved for the CTC blank label.
    cost = fluid.layers.warpctc(
        input=fc_out,
        label=label,
        size=num_classes + 1,
        blank=num_classes,
        norm_by_times=True)
    avg_cost = fluid.layers.mean(x=cost)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=args.learning_rate, momentum=args.momentum)
    optimizer.minimize(avg_cost)
    # decoder and evaluator
    decoded_out = fluid.layers.ctc_greedy_decoder(
        input=fc_out, blank=num_classes)
    # EditDistance expects int64 labels.
    casted_label = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded_out, label=casted_label)
    return avg_cost, error_evaluator
"""Trainer for OCR CTC model."""
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import dummy_reader
import argparse
import functools
import sys
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_train_net
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Command-line options: (name, type, default, help).
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('pass_num', int, 32, "# of training epochs.")
add_arg('log_period', int, 1000, "Log period.")
add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
add_arg('l2', float, 0.0004, "L2 regularizer.")
add_arg('max_clip', float, 10.0, "Max clip threshold.")
add_arg('min_clip', float, -10.0, "Min clip threshold.")
add_arg('momentum', float, 0.9, "Momentum.")
add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
add_arg('device', int, 0, "Device id.'-1' means running on CPU"
        "while '0' means GPU-0.")
# yapf: enable
def train(args, data_reader=dummy_reader):
    """OCR CTC training.

    Builds the graph via ctc_train_net, trains for args.pass_num passes,
    logs loss/edit distance every args.log_period batches, and evaluates
    on the test reader after each pass.

    Args:
        args: parsed command-line namespace (batch_size, pass_num,
            log_period, learning_rate, l2, momentum, device, ...).
        data_reader: module providing num_classes(), data_shape(),
            train(batch_size) and test() reader creators.
    """
    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()
    # define network
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    # lod_level=1: labels are variable-length sequences.
    label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
    avg_cost, error_evaluator = ctc_train_net(images, label, args, num_classes)
    # data reader (readers batch internally here, unlike the older script)
    train_reader = data_reader.train(args.batch_size)
    test_reader = data_reader.test()
    # prepare environment: CPU by default, GPU when a device id is given.
    place = fluid.CPUPlace()
    if args.device >= 0:
        place = fluid.CUDAPlace(args.device)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    # Inference program reuses the graph up to the evaluator (no backward).
    inference_program = fluid.io.get_inference_program(error_evaluator)
    for pass_id in range(args.pass_num):
        error_evaluator.reset(exe)
        batch_id = 0
        # train a pass
        for data in train_reader():
            loss, batch_edit_distance = exe.run(
                fluid.default_main_program(),
                feed=get_feeder_data(data, place),
                fetch_list=[avg_cost] + error_evaluator.metrics)
            if batch_id % args.log_period == 0:
                # NOTE(review): dividing by args.batch_size assumes full
                # batches; a final partial batch would skew this figure —
                # confirm the reader always yields full batches.
                print "Pass[%d]-batch[%d]; Loss: %s; Word error: %s." % (
                    pass_id, batch_id, loss[0], batch_edit_distance[0] / float(args.batch_size))
                sys.stdout.flush()
            batch_id += 1
        train_edit_distance = error_evaluator.eval(exe)
        print "End pass[%d]; Train word error: %s.\n" % (
            pass_id, str(train_edit_distance[0]))
        # evaluate model on test data
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=get_feeder_data(data, place))
        test_edit_distance = error_evaluator.eval(exe)
        print "End pass[%d]; Test word error: %s.\n" % (
            pass_id, str(test_edit_distance[0]))
def main():
    """Parse command-line arguments, echo them, and launch training."""
    args = parser.parse_args()
    print_arguments(args)
    train(args, data_reader=dummy_reader)
if __name__ == "__main__":
main()
......@@ -14,6 +14,8 @@
#limitations under the License.
import numpy as np
import paddle.v2 as paddle
DATA_SHAPE = [1, 512, 512]
NUM_CLASSES = 20
......@@ -30,14 +32,14 @@ def _read_creater(num_sample=1024, min_seq_len=1, max_seq_len=10):
return reader
def train(num_sample=16):
def train(batch_size, num_sample=16):
"""Get train dataset reader."""
return _read_creater(num_sample=num_sample)
return paddle.batch(_read_creater(num_sample=num_sample), batch_size)
def test(num_sample=16):
def test(batch_size=1, num_sample=16):
"""Get test dataset reader."""
return _read_creater(num_sample=num_sample)
return paddle.batch(_read_creater(num_sample=num_sample), batch_size)
def data_shape():
......
......@@ -17,6 +17,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
from paddle.v2.fluid import core
def print_arguments(args):
......@@ -58,3 +60,28 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def to_lodtensor(data, place):
    """Flatten a batch of variable-length sequences into an int32 LoDTensor.

    `data` is a list of per-sample sequences; the level-of-detail (LoD)
    stores the cumulative offsets so the tensor can be split per sample.
    """
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))
    flat = np.concatenate(data, axis=0).astype("int32")
    flat = flat.reshape([flat.shape[0], 1])
    tensor = core.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    return tensor
def get_feeder_data(data, place):
    """Pack a minibatch of (image, label) pairs into the network feed dict.

    Images are stacked along a new batch axis as float32; labels go through
    to_lodtensor to become a variable-length int32 LoDTensor.

    Args:
        data: iterable of (image_array, label_sequence) samples.
        place: fluid place (CPU/GPU) the tensors are copied to.

    Returns:
        Dict mapping the 'pixel' and 'label' feed names to LoDTensors.
    """
    # Fix: dropped the dead `pixel_data = None` assignment that was
    # immediately overwritten by the concatenation below.
    pixel_data = np.concatenate(
        map(lambda x: x[0][np.newaxis, :], data), axis=0).astype("float32")
    pixel_tensor = core.LoDTensor()
    pixel_tensor.set(pixel_data, place)
    label_tensor = to_lodtensor(map(lambda x: x[1], data), place)
    return {"pixel": pixel_tensor, "label": label_tensor}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册