attention-ocr: problem loading and saving model parameters
Created by: xiangyubo
- Version and environment info: 1) PaddlePaddle version: 1.4.1 2) GPU: V100 3) System environment: Python 3.6
- Model info: 1) Model name: attention-ocr 2) Dataset: a dataset I prepared myself 3) Algorithm name: 4) Model link: https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition
I slightly modified the attention model implementation and trained it on a small dataset I prepared myself, a bit over 10k samples. During training, the loss, edit distance, and estimated accuracy all look normal. While training, I call save_persistables to checkpoint the parameters whenever the estimated accuracy reaches a new best. After training finishes, I want to convert the saved parameters into inference format with save_inference_model, so my plan is to first load them with load_persistables and then call save_inference_model. If I do not call my rewritten infer, the whole process runs without error. If I do call my rewritten infer, reloading the parameters fails with an error saying the parameter conv_8 cannot be found. But the convolutional part of the model only has conv_0 through conv_7, so I am completely confused.
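To make the intended flow concrete, here is a minimal, self-contained sketch of the save → load → convert pipeline (the one-layer stand-in network and the `./demo-*` directory names are placeholders, not my real model; my actual scripts follow below):

```python
import paddle.fluid as fluid

# Sketch of the flow: checkpoint with save_persistables during training,
# reload with load_persistables, then re-save with save_inference_model.
# The one-layer fc net and the "./demo-*" directories are placeholders.
place = fluid.CPUPlace()
exe = fluid.Executor(place)

image = fluid.layers.data(name='image', shape=[1, 48, 512], dtype='float32')
out = fluid.layers.fc(input=image, size=10, act='softmax')  # stand-in for infer()

exe.run(fluid.default_startup_program())

# Training side: save the raw parameters whenever accuracy improves.
fluid.io.save_persistables(exe, "./demo-checkpoint", fluid.default_main_program())

# Conversion side: load the checkpoint back into the same graph ...
fluid.io.load_persistables(exe, "./demo-checkpoint", fluid.default_main_program())

# ... then freeze it into inference format (feed names and fetch targets baked in).
fluid.io.save_inference_model("./demo-freeze", ['image'], [out], exe)
```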
Here is my code. Training part:

```python
# -*- coding: UTF-8 -*-
"""
训练基于attention-ocr的网络,文字行识别
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import uuid
import numpy as np
import time
import six
import math
import random
import paddle
import paddle.fluid as fluid
import logging
import xml.etree.ElementTree
import codecs
import json
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from PIL import Image, ImageEnhance, ImageDraw
logger = None
train_parameters = {
    "input_size": [1, 48, 512],
    "data_dir": "data/data6927/word-recognition",
    "train_dir": "trainImageSet",
    "eval_dir": "evalImageSet",
    "train_list": "train.txt",
    "eval_list": "eval.txt",
    "label_list": "label_list.txt",
    "class_dim": -1,
    "label_dict": {},
    "image_count": -1,
    "continue_train": True,
    "pretrained": False,
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./attention-ocr-model",
    "num_epochs": 250,
    "train_batch_size": 256,
    "use_gpu": True,
    "decoder_size": 128,
    "word_vector_dim": 128,
    "max_char_length": 40,  # maximum length of a recognized string
    "gradient_clip": 10,
    "sos": 0,
    "eos": 1,
    "mean_color": 127.5,
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,
    "image_distort_strategy": {
        "expand_prob": 0.5,
        "expand_max_ratio": 2,
        "hue_prob": 0.5,
        "hue_delta": 18,
        "contrast_prob": 0.5,
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,
        "brightness_delta": 0.125
    },
    "sgd_strategy": {
        "learning_rate": 0.001,
        "lr_epochs": [70, 140, 200],
        "lr_decay": [1, 0.5, 0.1, 0.05]
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_accuracy": 0.95
    }
}
class AttentionOCR(object):
    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length, label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)
        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        # Four conv_bn_pool groups with two conv layers each, i.e. eight
        # conv layers in total (indices 0-7 in the auto-generated names).
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)
        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)
        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])
        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)
        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
            weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()
        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)
        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size,
                                       bias_attr=False, act="relu")
        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                     encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)
        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size,
                                       bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)
        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')
        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)
        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)
        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)
            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')
            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)
            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)
            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)
            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                # level=0
            )
            fluid.layers.increment(x=counter, value=1, in_place=True)
            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, train_parameters['eos'])
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context
def init_train_parameters():
    """
    Initialize training parameters, mainly the image count and the number of classes.
    :return:
    """
    train_list = os.path.join(train_parameters['data_dir'], train_parameters['train_list'])
    label_list = os.path.join(train_parameters['data_dir'], train_parameters['label_list'])
    index = 0
    with codecs.open(label_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        for line in lines:
            parts = line.split()
            train_parameters['label_dict'][parts[0]] = int(parts[1])
            index += 1
        train_parameters['class_dim'] = index
    with codecs.open(train_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        train_parameters['image_count'] = len(lines)
def init_log_config():
    """
    Initialize the logging configuration.
    :return:
    """
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_name = os.path.join(log_path, 'train.log')
    sh = logging.StreamHandler()
    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)
def resize_img(img, input_size):
    target_size = input_size
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    w_off = (target_size[2] - resized_width) // 2
    h_off = (target_size[1] - resized_height) // 2
    img = img.resize((resized_width, resized_height), Image.ANTIALIAS)
    array = np.ndarray((target_size[1], target_size[2], 3), np.uint8)
    array[:, :, 0] = 127
    array[:, :, 1] = 127
    array[:, :, 2] = 127
    ret = Image.fromarray(array)
    # paste at a random horizontal offset, vertically centered
    ret.paste(img, (int(np.random.randint(0, w_off + 1)), h_off))
    return ret
def random_brightness(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['brightness_prob']:
        brightness_delta = train_parameters['image_distort_strategy']['brightness_delta']
        delta = np.random.uniform(-brightness_delta, brightness_delta) + 1
        img = ImageEnhance.Brightness(img).enhance(delta)
    return img
def random_contrast(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['contrast_prob']:
        contrast_delta = train_parameters['image_distort_strategy']['contrast_delta']
        delta = np.random.uniform(-contrast_delta, contrast_delta) + 1
        img = ImageEnhance.Contrast(img).enhance(delta)
    return img
def random_saturation(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['saturation_prob']:
        saturation_delta = train_parameters['image_distort_strategy']['saturation_delta']
        delta = np.random.uniform(-saturation_delta, saturation_delta) + 1
        img = ImageEnhance.Color(img).enhance(delta)
    return img
def random_hue(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['hue_prob']:
        hue_delta = train_parameters['image_distort_strategy']['hue_delta']
        delta = np.random.uniform(-hue_delta, hue_delta)
        img_hsv = np.array(img.convert('HSV'))
        img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
        img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
    return img
def distort_image(img):
    prob = np.random.uniform(0, 1)
    # Apply the distortions in a different order
    if prob > 0.5:
        img = random_brightness(img)
        img = random_contrast(img)
        img = random_saturation(img)
        img = random_hue(img)
    else:
        img = random_brightness(img)
        img = random_saturation(img)
        img = random_hue(img)
        img = random_contrast(img)
    return img
def rotate_image(img):
    """
    Data augmentation: rotate by a small random angle.
    """
    prob = np.random.uniform(0, 1)
    if prob > 0.5:
        angle = np.random.randint(-8, 8)
        img = img.rotate(angle)
    return img
def random_expand(img, keep_ratio=True):
    if np.random.uniform(0, 1) < train_parameters['image_distort_strategy']['expand_prob']:
        return img
    max_ratio = train_parameters['image_distort_strategy']['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_parameters['mean_color']
    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    return Image.fromarray(out_img)
def preprocess(img, input_size):
    img_width, img_height = img.size
    if train_parameters['apply_distort']:
        img = distort_image(img)
    img = random_expand(img)
    img = rotate_image(img)
    # img = resize_img(img, input_size)
    # img = img.convert('L')
    # img = np.array(img).astype('float32') - train_parameters['mean_color']
    # img *= 0.007843
    return img
def custom_reader(file_list, data_dir, input_size, mode):
    def reader():
        np.random.shuffle(file_list)
        for line in file_list:
            # img_name, label
            parts = line.split()
            image_path = parts[0]
            img = Image.open(image_path)
            # img = Image.open(os.path.join(data_dir, image_path))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            label = [int(train_parameters['label_dict'][c]) for c in parts[-1]]
            if len(label) == 0:
                continue
            if mode == 'train':
                img = preprocess(img, input_size)
            img = resize_img(img, input_size)
            img = img.convert('L')
            # img.save(image_path)
            img = np.array(img).astype('float32') - train_parameters['mean_color']
            # img *= 0.007843
            img = img[np.newaxis, ...]
            # print("{0} {1}".format(image_path, label))
            sos = train_parameters['sos']
            eos = train_parameters['eos']
            yield img, [sos] + label, label + [eos]
    return reader
def multi_process_custom_reader(file_path, data_dir, num_workers, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    readers = []
    images = [line.strip() for line in open(file_path)]
    n = int(math.ceil(len(images) / num_workers))
    image_lists = [images[i: i + n] for i in range(0, len(images), n)]
    train_path = os.path.join(train_parameters['data_dir'], train_parameters['train_dir'])
    for l in image_lists:
        reader = paddle.batch(custom_reader(l, train_path, input_size, mode),
                              batch_size=train_parameters['train_batch_size'])
        readers.append(paddle.reader.shuffle(reader, train_parameters['train_batch_size']))
    return paddle.reader.multiprocess_reader(readers, False)
def create_eval_reader(file_path, data_dir, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    images = [line.strip() for line in open(file_path)]
    eval_path = os.path.join(train_parameters['data_dir'], train_parameters['eval_dir'])
    return paddle.batch(custom_reader(images, eval_path, input_size, mode),
                        batch_size=train_parameters['train_batch_size'])
def optimizer_sgd_setting():
    batch_size = train_parameters["train_batch_size"]
    iters = train_parameters["image_count"] // batch_size
    learning_strategy = train_parameters['sgd_strategy']
    lr = learning_strategy['learning_rate']
    boundaries = [i * iters for i in learning_strategy["lr_epochs"]]
    values = [i * lr for i in learning_strategy["lr_decay"]]
    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))
    return optimizer
def build_train_program_with_async_reader(main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)
        data_reader = fluid.layers.create_py_reader_by_data(capacity=train_parameters['train_batch_size'],
                                                            feed_list=[img, label_in, label_out],
                                                            name='train')
        multi_reader = multi_process_custom_reader(train_parameters['train_list'],
                                                   train_parameters['data_dir'],
                                                   train_parameters['multi_data_reader_count'],
                                                   train_parameters['input_size'],
                                                   'train')
        data_reader.decorate_paddle_reader(multi_reader)
        img, label_in, label_out = fluid.layers.read_file(data_reader)
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return data_reader, loss, distances, seq_num, decoded_out
def build_eval_program_with_feeder(main_prog, startup_prog, place):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)
        feeder = fluid.DataFeeder(feed_list=[img, label_in, label_out], place=place, program=main_prog)
        reader = create_eval_reader(train_parameters['eval_list'],
                                    train_parameters['data_dir'],
                                    train_parameters['input_size'],
                                    'eval')
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return reader, loss, distances, seq_num, decoded_out
def get_loss(img, label_in, label_out):
    with fluid.unique_name.guard():
        class_dim = train_parameters['class_dim']
        decoder_size = train_parameters['decoder_size']
        word_vector_dim = train_parameters['word_vector_dim']
        label_dict = train_parameters['label_dict']
        max_char_length = train_parameters['max_char_length']
        model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, label_dict)
        prediction = model.net(img, label_in)
        label_out = fluid.layers.cast(x=label_out, dtype='int64')
        cost = fluid.layers.cross_entropy(input=prediction, label=label_out)
        loss = fluid.layers.reduce_sum(cost)
        optimizer = optimizer_sgd_setting()
        optimizer.minimize(loss)
        _, decoded_out = fluid.layers.topk(input=prediction, k=1)
        sos = train_parameters['sos']
        eos = train_parameters['eos']
        distances, seq_num = fluid.layers.edit_distance(decoded_out, label_out, ignored_tokens=[sos, eos])
        return loss, distances, seq_num, decoded_out
def load_pretrained_params(exe, program):
    if train_parameters['continue_train'] and os.path.exists(train_parameters['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_parameters['save_model_dir'],
                                   main_program=program)
    elif train_parameters['pretrained'] and os.path.exists(train_parameters['pretrained_model_dir']):
        logger.info('load param from pretrained model')
        def if_exist(var):
            return os.path.exists(os.path.join(train_parameters['pretrained_model_dir'], var.name))
        fluid.io.load_vars(exe, train_parameters['pretrained_model_dir'], main_program=program,
                           predicate=if_exist)
def train():
    init_log_config()
    init_train_parameters()
    logger.info("start train attention-ocr, train params:%s", str(train_parameters))
    logger.info("create place, use gpu:" + str(train_parameters['use_gpu']))
    place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace()
    logger.info("build network and program")
    train_program = fluid.Program()
    train_start_program = fluid.Program()
    eval_program = fluid.Program()
    eval_start_program = fluid.Program()
    train_reader, loss, distances, seq_num, decoded_out = build_train_program_with_async_reader(train_program,
                                                                                                train_start_program)
    # eval_reader, eval_loss, eval_distances, eval_seq_num, eval_decoded_out = build_eval_program_with_feeder(eval_program, eval_start_program, place)
    # eval_program = eval_program.clone(for_test=True)
    logger.info("build executor and init params")
    exe = fluid.Executor(place)
    exe.run(train_start_program)
    train_fetch_list = [loss.name, distances.name, seq_num.name, decoded_out.name]
    # eval_fetch_list = [output.name]
    load_pretrained_params(exe, train_program)
    stop_strategy = train_parameters['early_stop']
    successive_limit = stop_strategy['successive_limit']
    sample_freq = stop_strategy['sample_frequency']
    min_accuracy = stop_strategy['min_accuracy']
    current_best_accuracy = 0.0
    stop_train = False
    successive_count = 0
    total_batch_count = 0
    distance_evaluator = fluid.metrics.EditDistance("edit-distance")
    for pass_id in range(train_parameters["num_epochs"]):
        logger.info("current pass: %d, start read image", pass_id)
        batch_id = 0
        train_reader.start()
        distance_evaluator.reset()
        try:
            while True:
                t1 = time.time()
                loss, distances, seq_num, decoded_out = exe.run(train_program, fetch_list=train_fetch_list,
                                                                return_numpy=False)
                distances = np.array(distances)
                seq_num = np.array(seq_num)
                distance_evaluator.update(distances, seq_num)
                period = time.time() - t1
                loss = np.mean(np.array(loss))
                batch_id += 1
                total_batch_count += 1
                if batch_id % 10 == 0:
                    distance, instance_error = distance_evaluator.eval()
                    # logger.info(np.array(decoded_out))
                    logger.info("Pass {0}, trainbatch {1}, loss {2} distance {3} instance error {4} time {5}"
                                .format(pass_id, batch_id, loss, distance, instance_error, "%2.2f sec" % period))
        except fluid.core.EOFException:
            train_reader.reset()
        distance, instance_error = distance_evaluator.eval()
        logger.info("Pass {0} distance {1} instance error {2}".format(pass_id, distance, instance_error))
        if 1.0 - instance_error >= current_best_accuracy:
            logger.info("temp save pass {0} train result, current best accuracy {1}".format(pass_id, 1.0 - instance_error))
            current_best_accuracy = 1.0 - instance_error
            fluid.io.save_persistables(dirname=train_parameters['save_model_dir'], main_program=train_program, executor=exe)
    logger.info("training till last epoch, end training")
if __name__ == '__main__':
    train()
```
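For completeness, this is roughly how the reader above can be smoke-tested on one sample (a hypothetical snippet, not part of my training run; it assumes lines in train.txt look like `image_path label`):

```python
# Hypothetical smoke test for custom_reader; run after the definitions above.
init_log_config()
init_train_parameters()  # fills label_dict / class_dim from label_list.txt
file_list = [line.strip() for line in open(
    os.path.join(train_parameters['data_dir'], train_parameters['train_list']))]
reader = custom_reader(file_list, train_parameters['data_dir'],
                       train_parameters['input_size'], 'train')
img, label_in, label_out = next(reader())
print(img.shape)            # expected: (1, 48, 512)
print(label_in, label_out)  # [sos] + label and label + [eos]
```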
Code that saves the model in inference format:

```python
# -*- coding: UTF-8 -*-
"""
固化基于attention-ocr的网络,文字行识别
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import random
import time
import codecs
import sys
import six
import functools
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from PIL import Image, ImageEnhance
class_dim = 63
decoder_size = 128
word_vector_dim = 128
target_size = [1, 48, 512]
mean_rgb = 127.5
use_gpu = True
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
sos = 0
eos = 1
max_char_length = 40
save_freeze_dir = "./attention-ocr-model"
class AttentionOCR(object):
    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length, label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)
        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)
        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)
        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])
        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)
        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)
        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
            weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()
        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)
        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size,
                                       bias_attr=False, act="relu")
        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                     encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)
        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size,
                                       bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)
        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)
        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')
        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)
        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)
        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)
            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')
            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)
            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)
            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)
            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                # level=0
            )
            fluid.layers.increment(x=counter, value=1, in_place=True)
            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)
            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, eos)
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context
def freeze_model():
    exe = fluid.Executor(fluid.CPUPlace())
    image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
    label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
    model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, {})
    pred = model.net(image, label_in)
    out = model.infer(image)
    freeze_program = fluid.default_main_program()
    exe.run(fluid.default_startup_program())
    fluid.io.load_persistables(exe, save_freeze_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    fluid.io.save_inference_model("./freeze_model", ['image'], [out], exe, freeze_program)
if __name__ == '__main__':
    freeze_model()
```
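A small diagnostic that may help narrow this down (a sketch, assuming save_persistables wrote one file per variable, which is its default): compare the persistable variable names the freeze program expects with the files actually present in the checkpoint directory. The helper name diff_params is mine, not a Paddle API.

```python
import os
import paddle.fluid as fluid

def diff_params(program, save_dir):
    """List persistable variables the program expects but the
    save_persistables directory does not contain, and vice versa."""
    expected = set(var.name for var in program.list_vars()
                   if fluid.io.is_persistable(var))
    on_disk = set(os.listdir(save_dir))
    print("expected but missing on disk:", sorted(expected - on_disk))
    print("on disk but never requested:", sorted(on_disk - expected))

# e.g. inside freeze_model(), just before load_persistables:
# diff_params(freeze_program, save_freeze_dir)
```

On my understanding, whatever shows up under "expected but missing" is the set of parameters load_persistables will complain about, which should reveal where the extra conv_8 name comes from.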