# train.py

"""Trainer for OCR CTC model."""
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import numpy as np
import dummy_reader
import argparse
import functools
from paddle.v2.fluid import core
from utility import add_arguments, print_arguments

# Command-line interface for this script. `add_arg` is `utility.add_arguments`
# pre-bound to the module-level parser; each call below presumably registers
# one flag as (name, type, default, help text) -- confirm against utility.py.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',     int,   16,     "Minibatch size.")
add_arg('pass_num',       int,   16,     "# of training epochs.")
add_arg('learning_rate',  float, 1.0e-3, "Learning rate.")
add_arg('l2',             float, 0.0005, "L2 regularizer.")
add_arg('max_clip',       float, 10.0,   "Max clip threshold.")
add_arg('min_clip',       float, -10.0,  "Min clip threshold.")
add_arg('momentum',       float, 0.9,    "Momentum.")
# The two adjacent literals are concatenated by the parser, so the separating
# space has to live inside one of them.
add_arg('device',         int,   -1,     "Device id. '-1' means running on CPU "
                                         "while '0' means GPU-0.")
# yapf: enable
def _to_lodtensor(data, place):
    """Pack a batch of variable-length sequences into a single LoDTensor.

    Args:
        data: a batch of sequences; each item must support `len()` and be
            concatenable by `np.concatenate` along axis 0.
        place: the device place handed to `LoDTensor.set`.

    Returns:
        A `core.LoDTensor` holding all sequences flattened into an int32
        column (one row per element), with a single LoD level whose entries
        are the cumulative start offsets of the sequences.
    """
    # Offsets start at 0 and grow by each sequence's length, so entry i is
    # where sequence i begins and the last entry is the total row count.
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    flat = np.concatenate(data, axis=0).astype("int32")
    flat = flat.reshape([flat.shape[0], 1])

    tensor = core.LoDTensor()
    tensor.set(flat, place)
    tensor.set_lod([offsets])
    return tensor

def _get_feeder_data(data, place):
    """Convert one minibatch from the reader into the executor's feed dict.

    Args:
        data: a batch of samples; for each sample, item 0 is the image array
            and item 1 is its label sequence.
        place: the device place the tensors are created on.

    Returns:
        A dict with two entries: "pixel", a LoDTensor of the images stacked
        along a new leading axis as float32, and "label", the LoDTensor
        built by `_to_lodtensor` from the label sequences.
    """
    # Build real lists rather than relying on map(): on Python 3 map() is a
    # lazy iterator, which np.concatenate cannot consume and which would be
    # exhausted by the length pass inside _to_lodtensor.
    images = [sample[0][np.newaxis, :] for sample in data]
    labels = [sample[1] for sample in data]

    pixel_tensor = core.LoDTensor()
    pixel_data = np.concatenate(images, axis=0).astype("float32")
    pixel_tensor.set(pixel_data, place)
    label_tensor = _to_lodtensor(labels, place)
    return {"pixel": pixel_tensor, "label": label_tensor}

def _ocr_conv(input, num, with_bn, param_attrs):
    """Build the convolutional feature extractor of the OCR network.

    Four `img_conv_group` stages are chained, using 16, 32, 64 and 128
    filters respectively. Each stage holds `num // 4` 3x3 convolutions
    (padding 1, ReLU) followed by a 2x2 max pooling with stride 2.

    Args:
        input: the input variable fed to the first stage.
        num: total number of convolution layers; must be a multiple of 4 so
            it can be split evenly across the four stages.
        with_bn: forwarded as `conv_with_batchnorm` to every stage.
        param_attrs: parameter attribute forwarded as `param_attr` to every
            stage.

    Returns:
        The output variable of the last stage.
    """
    assert (num % 4 == 0)

    def _conv_block(input, num_filters, group_size, with_bn):
        # One stage: `group_size` convolutions with `num_filters` filters
        # each, then a single pooling layer.
        return fluid.nets.img_conv_group(
            input=input,
            conv_num_filter=[num_filters] * group_size,
            pool_size=2,
            pool_stride=2,
            conv_padding=1,
            conv_filter_size=3,
            conv_act='relu',
            conv_with_batchnorm=with_bn,
            pool_type='max',
            param_attr=param_attrs)

    # Floor division keeps the group size an int even under true division;
    # a float here would make the list repetition above raise TypeError.
    group_size = num // 4

    features = input
    for num_filters in (16, 32, 64, 128):
        features = _conv_block(features, num_filters, group_size, with_bn)
    return features


def _ocr_ctc_net(images, num_classes, param_attrs):
    """Assemble the OCR network up to its final fully-connected output.

    The images go through `_ocr_conv` (8 convolutions, batch norm enabled),
    the resulting feature map is turned into a sequence with `im2sequence`
    (filter 1x3, stride 1x1), and that sequence is read by a forward and a
    reversed `dynamic_gru` of size 128. Both GRU outputs feed one
    fully-connected layer.

    Args:
        images: the input image variable.
        num_classes: number of label classes; the output layer has
            `num_classes + 1` units (`train` passes `num_classes` as the
            blank index to the CTC ops).
        param_attrs: parameter attribute shared by the conv stages, both
            GRUs and the output layer.

    Returns:
        The output variable of the fully-connected layer.
    """
    features = _ocr_conv(images, 8, True, param_attrs)
    sequence = fluid.layers.im2sequence(
        input=features, filter_size=[1, 3], stride=[1, 1])

    forward_gru = fluid.layers.dynamic_gru(
        input=sequence, size=128, param_attr=param_attrs)
    backward_gru = fluid.layers.dynamic_gru(
        input=sequence, size=128, param_attr=param_attrs, is_reverse=True)

    return fluid.layers.fc(
        input=[forward_gru, backward_gru],
        size=num_classes + 1,
        param_attr=param_attrs)


def train(l2=0.0005,
          min_clip=-10,
          max_clip=10,
          data_reader=dummy_reader,
          learning_rate=1.0e-3,
          momentum=0.9,
          batch_size=16,
          pass_num=2,
          device=0):
    """Train the OCR CTC model and report edit distance after every pass.

    Builds the network, the warp-CTC cost, a Momentum optimizer and an
    edit-distance evaluator, then runs `pass_num` passes over the training
    data. After each pass the evaluator is reset and the test data is run
    through the inference program. Progress is printed to stdout.

    Args:
        l2: L2 regularization coefficient; it is multiplied by `batch_size`
            before being passed to `L2Decay`.
        min_clip: lower threshold for gradient clipping by value.
        max_clip: upper threshold for gradient clipping by value.
        data_reader: object providing `num_classes()`, `data_shape()`,
            `train()` and `test()`.
        learning_rate: base learning rate; it is divided by `batch_size`
            before being passed to the optimizer.
        momentum: momentum of the Momentum optimizer.
        batch_size: number of samples per minibatch.
        pass_num: number of passes over the training data.
        device: GPU id used for `CUDAPlace` when >= 0; any negative value
            selects the CPU.
    """
    num_classes = data_reader.num_classes()
    # define network
    param_attrs = fluid.ParamAttr(
        regularizer=fluid.regularizer.L2Decay(l2 * batch_size),
        gradient_clip=fluid.clip.GradientClipByValue(max_clip, min_clip))
    data_shape = data_reader.data_shape()
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(
        name='label', shape=[1], dtype='int32', lod_level=1)
    fc_out = _ocr_ctc_net(images, num_classes, param_attrs)

    # define cost and optimizer
    cost = fluid.layers.warpctc(
        input=fc_out,
        label=label,
        size=num_classes + 1,
        blank=num_classes,
        norm_by_times=True)
    # avg_cost is only fetched for logging below.
    avg_cost = fluid.layers.mean(x=cost)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate / batch_size, momentum=momentum)
    # NOTE(review): the optimizer minimizes the un-averaged `cost`, not
    # `avg_cost`. Together with the batch_size scaling of the learning rate
    # and the L2 coefficient this looks deliberate -- confirm.
    optimizer.minimize(cost)

    # decoder and evaluator
    decoded_out = fluid.layers.ctc_greedy_decoder(
        input=fc_out, blank=num_classes)
    casted_label = fluid.layers.cast(x=label, dtype='int64')
    error_evaluator = fluid.evaluator.EditDistance(
        input=decoded_out, label=casted_label)
    # data reader
    train_reader = paddle.batch(data_reader.train(), batch_size=batch_size)
    test_reader = paddle.batch(data_reader.test(), batch_size=batch_size)
    # prepare environment
    place = fluid.CPUPlace()
    if device >= 0:
        place = fluid.CUDAPlace(device)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    inference_program = fluid.io.get_inference_program(error_evaluator)
    for pass_id in range(pass_num):
        error_evaluator.reset(exe)
        batch_id = 0
        # train a pass
        for data in train_reader():
            loss, batch_edit_distance, _, _ = exe.run(
                fluid.default_main_program(),
                feed=_get_feeder_data(data, place),
                fetch_list=[avg_cost] + error_evaluator.metrics)
            # Parenthesised single-argument print behaves the same as the
            # print statement on Python 2 and is valid on Python 3.
            print("Pass[%d], batch[%d]; loss: %s; edit distance: %s." % (
                pass_id, batch_id, loss[0], batch_edit_distance[0]))
            batch_id += 1

        train_edit_distance = error_evaluator.eval(exe)
        print("End pass[%d]; train data edit_distance: %s." % (
            pass_id, str(train_edit_distance[0])))

        # evaluate model on test data
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=_get_feeder_data(data, place))
        test_edit_distance = error_evaluator.eval(exe)
        print("End pass[%d]; test data edit_distance: %s." % (
            pass_id, str(test_edit_distance[0])))


def main():
    """Parse the command-line flags, print them, and start training."""
    args = parser.parse_args()
    print_arguments(args)

    # Every flag listed here is forwarded to train() under the same name.
    forwarded = ('l2', 'min_clip', 'max_clip', 'learning_rate', 'momentum',
                 'batch_size', 'pass_num', 'device')
    train(**{name: getattr(args, name) for name in forwarded})

# Start training only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()