PaddlePaddle / Paddle · Issue #18271
Closed · Opened Jun 23, 2019 by saxon_zh (Guest)

Problem loading and saving attention-ocr model parameters

Created by: xiangyubo


  • Version and environment info: PaddlePaddle version: 1.4.1; GPU: V100; system environment: Python 3.6
  • Model info: model name: attention-ocr; dataset: a self-prepared dataset; model link: https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition

I slightly modified the attention model implementation and trained it on a small dataset I prepared myself, a bit over 10k samples. During training, the loss, edit distance, and estimated accuracy all look normal. While training, I use save_persistables to save the parameters of the model with the best estimated accuracy so far. After training, I want to convert the saved parameters into inference form with save_inference_model, so my plan is to first load them with load_persistables and then call save_inference_model. If I do not call my rewritten infer, the whole process runs without error. But if I do call my rewritten infer, reloading the parameters fails with an error saying the parameters for conv_8 cannot be found. Yet the convolutional part of the model only has conv layers numbered 0 through 7, so I am quite confused.
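For reference, a minimal sketch of how I understand fluid's automatic parameter naming (an assumption on my part, not verified against 1.4.1): names come from a process-wide counter, so a second network built into the same program keeps counting upward instead of reusing the names that save_persistables wrote out.

```python
import paddle.fluid as fluid

# Minimal sketch (assumption): fluid auto-names parameters from a global
# counter, so a second conv2d in the same program gets the next index
# rather than restarting at 0.
img = fluid.layers.data(name='img', shape=[1, 48, 512], dtype='float32')
fluid.layers.conv2d(input=img, num_filters=16, filter_size=3)  # -> conv2d_0.w_0
fluid.layers.conv2d(input=img, num_filters=16, filter_size=3)  # -> conv2d_1.w_0

# print the persistable (parameter) names the program now contains
for var in fluid.default_main_program().list_vars():
    if var.persistable:
        print(var.name)
```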

Below is my code. The training part:

```python

# -*- coding: UTF-8 -*-
"""
Train the attention-ocr based network for text-line recognition
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import uuid
import numpy as np
import time
import six
import math
import random
import paddle
import paddle.fluid as fluid
import logging
import xml.etree.ElementTree
import codecs
import json

from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.regularizer import L2Decay
from PIL import Image, ImageEnhance, ImageDraw

logger = None
train_parameters = {
    "input_size": [1, 48, 512],
    "data_dir": "data/data6927/word-recognition",
    "train_dir": "trainImageSet",
    "eval_dir": "evalImageSet",
    "train_list": "train.txt",
    "eval_list": "eval.txt",
    "label_list": "label_list.txt",
    "class_dim": -1,
    "label_dict": {},
    "image_count": -1,
    "continue_train": True,
    "pretrained": False,
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./attention-ocr-model",
    "num_epochs": 250,
    "train_batch_size": 256,
    "use_gpu": True,
    "decoder_size": 128,
    "word_vector_dim": 128,
    "max_char_length": 40,      # 最大识别字符串长度
    "gradient_clip": 10,
    "sos": 0,
    "eos": 1,
    "mean_color": 127.5,
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,
    "image_distort_strategy": {
        "expand_prob": 0.5,
        "expand_max_ratio": 2,
        "hue_prob": 0.5,
        "hue_delta": 18,
        "contrast_prob": 0.5,
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,
        "brightness_delta": 0.125
    },
    "sgd_strategy": {
        "learning_rate": 0.001,
        "lr_epochs": [70, 140, 200],
        "lr_decay": [1, 0.5, 0.1, 0.05]
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_accuracy": 0.95
    }
}


class AttentionOCR(object):

    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)

        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)

        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)

        # im2sequence with filter height equal to the feature-map height and
        # width 1 turns each column of the conv features into one timestep
        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])

        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)

        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)

        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')

        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)

        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
            weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):

        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")

        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')

        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)

        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)

        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)

            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')

            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)

            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)

            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)

            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                #level=0
            )

            fluid.layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)

            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, train_parameters['eos'])  # end-of-sequence token id
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context


def init_train_parameters():
    """
    Initialize the training parameters, mainly the image count and the number of classes
    :return:
    """
    train_list = os.path.join(train_parameters['data_dir'], train_parameters['train_list'])
    label_list = os.path.join(train_parameters['data_dir'], train_parameters['label_list'])
    index = 0
    with codecs.open(label_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        for line in lines:
            parts = line.split()
            train_parameters['label_dict'][parts[0]] = int(parts[1])
            index += 1
        train_parameters['class_dim'] = index
    with codecs.open(train_list, encoding='utf-8') as flist:
        lines = [line.strip() for line in flist]
        train_parameters['image_count'] = len(lines)


def init_log_config():
    """
    Initialize the logging configuration
    :return:
    """
    global logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_name = os.path.join(log_path, 'train.log')
    sh = logging.StreamHandler()
    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.addHandler(fh)


def resize_img(img, input_size):
    target_size = input_size
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    w_off = (target_size[2] - resized_width) / 2
    h_off = (target_size[1] - resized_height) / 2
    img = img.resize((resized_width, resized_height), Image.ANTIALIAS)
    array = np.ndarray((target_size[1], target_size[2], 3), np.uint8)
    array[:, :, 0] = 127
    array[:, :, 1] = 127
    array[:, :, 2] = 127
    ret = Image.fromarray(array)
    ret.paste(img, (np.random.randint(0, int(w_off) + 1), int(h_off)))
    return ret


def random_brightness(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['brightness_prob']:
        brightness_delta = train_parameters['image_distort_strategy']['brightness_delta']
        delta = np.random.uniform(-brightness_delta, brightness_delta) + 1
        img = ImageEnhance.Brightness(img).enhance(delta)
    return img


def random_contrast(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['contrast_prob']:
        contrast_delta = train_parameters['image_distort_strategy']['contrast_delta']
        delta = np.random.uniform(-contrast_delta, contrast_delta) + 1
        img = ImageEnhance.Contrast(img).enhance(delta)
    return img


def random_saturation(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['saturation_prob']:
        saturation_delta = train_parameters['image_distort_strategy']['saturation_delta']
        delta = np.random.uniform(-saturation_delta, saturation_delta) + 1
        img = ImageEnhance.Color(img).enhance(delta)
    return img


def random_hue(img):
    prob = np.random.uniform(0, 1)
    if prob < train_parameters['image_distort_strategy']['hue_prob']:
        hue_delta = train_parameters['image_distort_strategy']['hue_delta']
        delta = np.random.uniform(-hue_delta, hue_delta)
        img_hsv = np.array(img.convert('HSV'))
        img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
        img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
    return img


def distort_image(img):
    prob = np.random.uniform(0, 1)
    # Apply different distort order
    if prob > 0.5:
        img = random_brightness(img)
        img = random_contrast(img)
        img = random_saturation(img)
        img = random_hue(img)
    else:
        img = random_brightness(img)
        img = random_saturation(img)
        img = random_hue(img)
        img = random_contrast(img)
    return img


def rotate_image(img):
    """
    Image augmentation: apply a random rotation angle
    """
    prob = np.random.uniform(0, 1)
    if prob > 0.5:
        angle = np.random.randint(-8, 8)
        img = img.rotate(angle)
    return img


def random_expand(img, keep_ratio=True):
    if np.random.uniform(0, 1) < train_parameters['image_distort_strategy']['expand_prob']:
        return img

    max_ratio = train_parameters['image_distort_strategy']['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_parameters['mean_color']

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img

    return Image.fromarray(out_img)


def preprocess(img, input_size):
    img_width, img_height = img.size
    if train_parameters['apply_distort']:
        img = distort_image(img)
    img = random_expand(img)
    img = rotate_image(img)
    # img = resize_img(img, input_size)
    # img = img.convert('L')
    # img = np.array(img).astype('float32') - train_parameters['mean_color']
    # img *= 0.007843
    return img


def custom_reader(file_list, data_dir, input_size, mode):
    def reader():
        np.random.shuffle(file_list)
        for line in file_list:
            # img_name, label
            parts = line.split()
            image_path = parts[0]
            img = Image.open(image_path)
            # img = Image.open(os.path.join(data_dir, image_path))
            if img.mode != 'RGB':
                img = img.convert('RGB')
            label = [int(train_parameters['label_dict'][c]) for c in parts[-1]]
            if len(label) == 0:
                continue
            if mode == 'train':
                img = preprocess(img, input_size)
            img = resize_img(img, input_size)
            img = img.convert('L')
            # img.save(image_path)
            img = np.array(img).astype('float32') - train_parameters['mean_color']
            # img *= 0.007843
            img = img[np.newaxis, ...]
            # print("{0} {1}".format(image_path, label))
            sos = train_parameters['sos']
            eos = train_parameters['eos']
            yield img, [sos] + label, label + [eos]

    return reader


def multi_process_custom_reader(file_path, data_dir, num_workers, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    readers = []
    images = [line.strip() for line in open(file_path)]
    n = int(math.ceil(len(images) / num_workers))  # ceil division, so every image lands in a chunk
    image_lists = [images[i: i + n] for i in range(0, len(images), n)]
    train_path = os.path.join(train_parameters['data_dir'], train_parameters['train_dir'])
    for l in image_lists:
        reader = paddle.batch(custom_reader(l, train_path, input_size, mode),
                              batch_size=train_parameters['train_batch_size'])
        readers.append(paddle.reader.shuffle(reader, train_parameters['train_batch_size']))
    return paddle.reader.multiprocess_reader(readers, False)


def create_eval_reader(file_path, data_dir, input_size, mode):
    file_path = os.path.join(data_dir, file_path)
    images = [line.strip() for line in open(file_path)]
    eval_path = os.path.join(train_parameters['data_dir'], train_parameters['eval_dir'])
    return paddle.batch(custom_reader(images, eval_path, input_size, mode),
                        batch_size=train_parameters['train_batch_size'])


def optimizer_sgd_setting():
    batch_size = train_parameters["train_batch_size"]
    iters = train_parameters["image_count"] // batch_size
    learning_strategy = train_parameters['sgd_strategy']
    lr = learning_strategy['learning_rate']

    boundaries = [i * iters for i in learning_strategy["lr_epochs"]]
    values = [i * lr for i in learning_strategy["lr_decay"]]
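    # e.g. with the defaults above (lr=0.001): step down at epochs 70/140/200,
    # giving learning rates 0.001, 0.0005, 0.0001 and finally 0.00005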

    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))

    return optimizer


def build_train_program_with_async_reader(main_prog, startup_prog):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)
        
        data_reader = fluid.layers.create_py_reader_by_data(capacity=train_parameters['train_batch_size'],
                                                            feed_list=[img, label_in, label_out],
                                                            name='train')
        multi_reader = multi_process_custom_reader(train_parameters['train_list'],
                                                   train_parameters['data_dir'],
                                                   train_parameters['multi_data_reader_count'],
                                                   train_parameters['input_size'],
                                                   'train')
        data_reader.decorate_paddle_reader(multi_reader)
        img, label_in, label_out = fluid.layers.read_file(data_reader)
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return data_reader, loss, distances, seq_num, decoded_out


def build_eval_program_with_feeder(main_prog, startup_prog, place):
    with fluid.program_guard(main_prog, startup_prog):
        img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
        label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
        label_out = fluid.layers.data(name='label_out', shape=[1], dtype='int32', lod_level=1)

        feeder = fluid.DataFeeder(feed_list=[img, label_in, label_out], place=place, program=main_prog)
        reader = create_eval_reader(train_parameters['eval_list'],
                                    train_parameters['data_dir'],
                                    train_parameters['input_size'],
                                    'eval')
        loss, distances, seq_num, decoded_out = get_loss(img, label_in, label_out)
        return reader, loss, distances, seq_num, decoded_out


def get_loss(img, label_in, label_out):
    with fluid.unique_name.guard():
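        # guard() restarts fluid's unique-name counter, so the auto-generated
        # parameter names (conv2d_0..., fc_0...) come out the same every time
        # this network is built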
        class_dim = train_parameters['class_dim']
        decoder_size = train_parameters['decoder_size']
        word_vector_dim = train_parameters['word_vector_dim']
        label_dict = train_parameters['label_dict']
        max_char_length = train_parameters['max_char_length']
        model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, label_dict)
        prediction = model.net(img, label_in)

        label_out = fluid.layers.cast(x=label_out, dtype='int64')
        cost = fluid.layers.cross_entropy(input=prediction, label=label_out)
        loss = fluid.layers.reduce_sum(cost)

        optimizer = optimizer_sgd_setting()
        optimizer.minimize(loss)

        _, decoded_out = fluid.layers.topk(input=prediction, k=1)
        casted_label = fluid.layers.cast(x=label_out, dtype='int64')
        sos = train_parameters['sos']
        eos = train_parameters['eos']
        distances, seq_num = fluid.layers.edit_distance(decoded_out, label_out, ignored_tokens=[sos, eos])

        return loss, distances, seq_num, decoded_out


def load_pretrained_params(exe, program):
    if train_parameters['continue_train'] and os.path.exists(train_parameters['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_parameters['save_model_dir'],
                                   main_program=program)
    elif train_parameters['pretrained'] and os.path.exists(train_parameters['pretrained_model_dir']):
        logger.info('load param from pretrained model')

        def if_exist(var):
            return os.path.exists(os.path.join(train_parameters['pretrained_model_dir'], var.name))

        fluid.io.load_vars(exe, train_parameters['pretrained_model_dir'], main_program=program,
                           predicate=if_exist)


def train():
    init_log_config()
    init_train_parameters()
    logger.info("start train attention-ocr, train params:%s", str(train_parameters))

    logger.info("create place, use gpu:" + str(train_parameters['use_gpu']))
    place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    train_start_program = fluid.Program()
    eval_program = fluid.Program()
    eval_start_program = fluid.Program()
    train_reader, loss, distances, seq_num, decoded_out = build_train_program_with_async_reader(train_program,
                                                                                                train_start_program)
    # eval_reader, eval_loss, eval_distances, eval_seq_num, eval_decoded_out = build_eval_program_with_feeder(eval_program, eval_start_program, place)
    # eval_program = eval_program.clone(for_test=True)

    logger.info("build executor and init params")
    exe = fluid.Executor(place)
    exe.run(train_start_program)
    train_fetch_list = [loss.name, distances.name, seq_num.name, decoded_out.name]
    # eval_fetch_list = [output.name]
    load_pretrained_params(exe, train_program)

    stop_strategy = train_parameters['early_stop']
    successive_limit = stop_strategy['successive_limit']
    sample_freq = stop_strategy['sample_frequency']
    min_accuracy = stop_strategy['min_accuracy']
    current_best_accuracy = 0.0
    stop_train = False
    successive_count = 0
    total_batch_count = 0
    distance_evaluator = fluid.metrics.EditDistance("edit-distance")
    for pass_id in range(train_parameters["num_epochs"]):
        logger.info("current pass: %d, start read image", pass_id)
        batch_id = 0
        train_reader.start()
        distance_evaluator.reset()
        try:
            while True:
                t1 = time.time()
                loss, distances, seq_num, decoded_out = exe.run(train_program, fetch_list=train_fetch_list,
                                                                return_numpy=False)
                distances = np.array(distances)
                seq_num = np.array(seq_num)
                distance_evaluator.update(distances, seq_num)
                period = time.time() - t1
                loss = np.mean(np.array(loss))
                batch_id += 1
                total_batch_count += 1

                if batch_id % 10 == 0:
                    distance, instance_error = distance_evaluator.eval()
                    # logger.info(np.array(decoded_out))
                    logger.info("Pass {0}, trainbatch {1}, loss {2} distance {3} instance error {4} time {5}"
                                .format(pass_id, batch_id, loss, distance, instance_error, "%2.2f sec" % period))

        except fluid.core.EOFException:
            train_reader.reset()

        distance, instance_error = distance_evaluator.eval()
        logger.info("Pass {0} distance {1} instance error {2}".format(pass_id, distance, instance_error))
        
        if 1.0 - instance_error >= current_best_accuracy:
            logger.info("temp save pass {0} train result, current bset accuracy {1}".format(pass_id, 1.0 - instance_error))
            current_best_accuracy = 1.0 - instance_error
            fluid.io.save_persistables(dirname=train_parameters['save_model_dir'], main_program=train_program, executor=exe)

    logger.info("training till last epcho, end training")


if __name__ == '__main__':
    train()

```

The code that saves the model in inference form:

```python

# -*- coding: UTF-8 -*-
"""
Freeze the attention-ocr based network for text-line recognition
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import random
import time
import codecs
import sys
import six
import functools
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from PIL import Image, ImageEnhance


class_dim = 63
decoder_size = 128
word_vector_dim = 128

target_size = [1, 48, 512]
mean_rgb = 127.5
use_gpu = True
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
sos = 0
eos = 1
max_char_length = 40
save_freeze_dir = "./attention-ocr-model"


class AttentionOCR(object):

    def __init__(self, num_classes, decoder_size, word_vector_dim, max_char_length ,label_dict):
        self.outputs = None
        self.decoder_size = decoder_size
        self.word_vector_dim = word_vector_dim
        self.label_dict = label_dict
        self.max_char_length = max_char_length
        self.num_classes = num_classes

    def name(self):
        return 'attention-ocr'

    def conv_bn_pool(self, input, group, out_ch, act="relu", is_test=False, pooling=True, use_cudnn=False):
        tmp = input
        for i in six.moves.xrange(group):
            filter_size = 3
            conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
            conv_param = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, conv_std))
            tmp = fluid.layers.conv2d(
                input=tmp,
                num_filters=out_ch[i],
                filter_size=3,
                padding=1,
                bias_attr=False,
                param_attr=conv_param,
                act=None,  # LinearActivation
                use_cudnn=use_cudnn)
            tmp = fluid.layers.batch_norm(input=tmp, act=act, is_test=is_test)
        if pooling:
            tmp = fluid.layers.pool2d(
                input=tmp,
                pool_size=2,
                pool_type='max',
                pool_stride=2,
                use_cudnn=use_cudnn,
                ceil_mode=True)

        return tmp

    def ocr_convs(self, input, is_test=False, use_cudnn=True):
        tmp = input
        tmp = self.conv_bn_pool(tmp, 2, [16, 16], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [32, 32], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [64, 64], is_test=is_test, use_cudnn=use_cudnn)
        tmp = self.conv_bn_pool(tmp, 2, [128, 128], is_test=is_test, pooling=False, use_cudnn=use_cudnn)

        return tmp

    def encoder_net(self, images, rnn_hidden_size=200, is_test=False, use_cudnn=True):
        conv_features = self.ocr_convs(images, is_test=is_test, use_cudnn=use_cudnn)

        sliced_feature = fluid.layers.im2sequence(
            input=conv_features,
            stride=[1, 1],
            filter_size=[conv_features.shape[2], 1])

        para_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02))
        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(0.0, 0.02), learning_rate=2.0)

        fc_1 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)
        fc_2 = fluid.layers.fc(input=sliced_feature,
                               size=rnn_hidden_size * 3,
                               param_attr=para_attr,
                               bias_attr=False)

        gru_forward = fluid.layers.dynamic_gru(
            input=fc_1,
            size=rnn_hidden_size,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')
        gru_backward = fluid.layers.dynamic_gru(
            input=fc_2,
            size=rnn_hidden_size,
            is_reverse=True,
            param_attr=para_attr,
            bias_attr=bias_attr,
            candidate_activation='relu')

        encoded_vector = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)
        encoded_proj = fluid.layers.fc(input=encoded_vector,
                                       size=self.decoder_size,
                                       bias_attr=False)

        return gru_backward, encoded_vector, encoded_proj

    def gru_decoder_with_attention(self, target_embedding, encoder_vec, encoder_proj, decoder_boot):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=self.decoder_size,
                                                 bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = encoder_proj + decoder_state_expand
            concated = fluid.layers.tanh(x=concated)
            attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
            weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            fc_1 = fluid.layers.fc(input=context, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=current_word, size=self.decoder_size * 3, bias_attr=False)
            decoder_inputs = fc_1 + fc_2
            h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=self.decoder_size * 3)
            rnn.update_memory(hidden_mem, h)
            out = fluid.layers.fc(input=h, size=self.num_classes + 2, bias_attr=True, act='softmax')
            rnn.output(out)
        return rnn()

    def net(self, images, label_in):

        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")

        label_in = fluid.layers.cast(x=label_in, dtype='int64')
        trg_embedding = fluid.layers.embedding(
            input=label_in,
            size=[self.num_classes + 2, self.word_vector_dim],
            dtype='float32')
        prediction = self.gru_decoder_with_attention(trg_embedding, encoded_vector,
                                                encoded_proj, decoder_boot)
        return prediction

    def infer(self, images, use_cudnn=True):
        beam_size = 1
        gru_backward, encoded_vector, encoded_proj = self.encoder_net(images, is_test=True, use_cudnn=use_cudnn)

        backward_first = fluid.layers.sequence_pool(input=gru_backward, pool_type='first')
        decoder_boot = fluid.layers.fc(input=backward_first, size=self.decoder_size, 
                                        bias_attr=False, act="relu")
        init_state = decoder_boot
        array_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=self.max_char_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = fluid.layers.create_array('float32')
        fluid.layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = fluid.layers.create_array('int64')
        scores_array = fluid.layers.create_array('float32')

        init_ids = fluid.layers.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
        init_scores = fluid.layers.data(name="init_scores", shape=[1], dtype="float32", lod_level=2)

        fluid.layers.array_write(init_ids, array=ids_array, i=counter)
        fluid.layers.array_write(init_scores, array=scores_array, i=counter)

        cond = fluid.layers.less_than(x=counter, y=array_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)

            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids,
                size=[self.num_classes + 2, self.word_vector_dim],
                dtype='float32')

            context = self._simple_attention(encoded_vector, encoded_proj, pre_state, self.decoder_size)

            # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
            pre_state_expanded = fluid.layers.sequence_expand(pre_state, pre_score)
            context_expanded = fluid.layers.sequence_expand(context, pre_score)
            fc_1 = fluid.layers.fc(input=context_expanded, size=self.decoder_size * 3, bias_attr=False)
            fc_2 = fluid.layers.fc(input=pre_ids_emb, size=self.decoder_size * 3, bias_attr=False)

            decoder_inputs = fc_1 + fc_2
            current_state, _, _ = fluid.layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=self.decoder_size * 3)

            current_state_with_lod = fluid.layers.lod_reset(x=current_state, y=pre_score)
            # use score to do beam search
            current_score = fluid.layers.fc(input=current_state_with_lod,
                                            size=self.num_classes + 2,
                                            bias_attr=True,
                                            act='softmax')
            topk_scores, topk_indices = fluid.layers.topk(current_score, k=beam_size)

            # calculate accumulated scores after topk to reduce computation cost
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                beam_size,
                1,  # end_id
                #level=0
            )

            fluid.layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            fluid.layers.array_write(current_state, array=state_array, i=counter)
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(selected_scores, array=scores_array, i=counter)

            # update the break condition: up to the max length or all candidates of
            # source sentences have ended.
            length_cond = fluid.layers.less_than(x=counter, y=array_len)
            finish_cond = fluid.layers.logical_not(fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        ids, scores = fluid.layers.beam_search_decode(ids_array, scores_array,
                                                      beam_size, eos)
        return ids

    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state, decoder_size):
        decoder_state_proj = fluid.layers.fc(input=decoder_state, size=decoder_size, bias_attr=False)
        decoder_state_expand = fluid.layers.sequence_expand(x=decoder_state_proj, y=encoder_proj)
        concated = fluid.layers.elementwise_add(encoder_proj, decoder_state_expand)
        concated = fluid.layers.tanh(x=concated)
        attention_weights = fluid.layers.fc(input=concated, size=1, act=None, bias_attr=False)
        attention_weights = fluid.layers.sequence_softmax(input=attention_weights)
        weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1])
        scaled = fluid.layers.elementwise_mul(x=encoder_vec, y=weights_reshape, axis=0)
        context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
        return context
        

def freeze_model():

    exe = fluid.Executor(fluid.CPUPlace())
    image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
    label_in = fluid.layers.data(name='label_in', shape=[1], dtype='int32', lod_level=1)
    model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, {})
    pred = model.net(image, label_in)
    out = model.infer(image)
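    # NOTE: net() and infer() are built into the same default_main_program(),
    # and unlike get_loss() in the training script there is no
    # fluid.unique_name.guard() here, so the conv/fc layers that infer()
    # creates get fresh auto-generated names continuing after the ones net()
    # already used (my assumption: this is where a name like conv_8 appears)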

    freeze_program = fluid.default_main_program()
    exe.run(fluid.Program())  # runs a fresh empty Program; default_startup_program() is never executed here
    fluid.io.load_persistables(exe, save_freeze_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    fluid.io.save_inference_model("./freeze_model", ['image'], out, exe, freeze_program)


if __name__ == '__main__':
    freeze_model()

```
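If the missing conv_8 parameters really do come from net() and infer() sharing one program, then a possible workaround, sketched below and untested (the function name freeze_model_infer_only is mine), would be to build only the inference graph inside fluid.unique_name.guard(), the way get_loss() does in the training script:

```python
def freeze_model_infer_only():
    """Untested sketch: build only the inference graph, inside
    unique_name.guard(), so auto-generated parameter names restart at 0
    and line up with what save_persistables wrote during training."""
    exe = fluid.Executor(fluid.CPUPlace())
    with fluid.unique_name.guard():
        image = fluid.layers.data(name='image', shape=target_size, dtype='float32')
        model = AttentionOCR(class_dim, decoder_size, word_vector_dim, max_char_length, {})
        out = model.infer(image)
    freeze_program = fluid.default_main_program()
    exe.run(fluid.default_startup_program())  # initialize, then overwrite with the saved persistables
    fluid.io.load_persistables(exe, save_freeze_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    # (init_ids / init_scores created inside infer() may also need to be listed as feeds)
    fluid.io.save_inference_model("./freeze_model", ['image'], [out], exe, freeze_program)
```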
