"""Train ImageNet image classification models with PaddlePaddle Fluid."""
import os
import sys
import time
import math
import argparse
import functools

import numpy as np

import paddle
import paddle.fluid as fluid

import models
import reader
from models.learning_rate import cosine_decay
from utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',       int,   256,                  "Minibatch size.")
add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
add_arg('total_images',     int,   1281167,              "Number of training images.")
add_arg('num_epochs',       int,   120,                  "Number of epochs.")
add_arg('class_dim',        int,   1000,                 "Number of classes.")
add_arg('image_shape',      str,   "3,224,224",          "Input image size.")
add_arg('model_save_dir',   str,   "output",             "Directory to save models.")
add_arg('with_mem_opt',     bool,  True,                 "Whether to use memory optimization or not.")
add_arg('pretrained_model', str,   None,                 "Path to a pretrained model to load (optional).")
add_arg('checkpoint',       str,   None,                 "Path to a checkpoint to resume from (optional).")
add_arg('lr',               float, 0.1,                  "Initial learning rate.")
add_arg('lr_strategy',      str,   "piecewise_decay",    "Learning rate decay strategy.")
add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Network architecture to train.")
# yapf: enable
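
# Example invocation (flag values are illustrative, not required):
#   python train.py --model=SE_ResNeXt50_32x4d --batch_size=256 \
#       --lr_strategy=piecewise_decay --num_epochs=120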

model_list = [m for m in dir(models) if "__" not in m]


def optimizer_setting(params):
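    """Build a Momentum optimizer for the given model/argument settings.

    params["learning_strategy"]["name"] selects the schedule:
      * "piecewise_decay": multiply the base lr by 0.1 at each epoch boundary
        in params["learning_strategy"]["epochs"].
      * "cosine_decay": decay the lr with a cosine schedule over num_epochs.
      * anything else: use the constant lr given in params["lr"].
    All variants use momentum 0.9 and L2 weight decay 1e-4.
    """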
    ls = params["learning_strategy"]

    if ls["name"] == "piecewise_decay":
        if "total_images" not in params:
            total_images = 1281167
        else:
            total_images = params["total_images"]

        batch_size = ls["batch_size"]
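        # Approximate number of minibatches per epoch; used to convert epoch
        # boundaries into iteration boundaries for the piecewise schedule.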
        step = int(total_images / batch_size + 1)

        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif ls["name"] == "cosine_decay":
        if "total_images" not in params:
            total_images = 1281167
        else:
            total_images = params["total_images"]

        batch_size = ls["batch_size"]
        step = int(total_images / batch_size + 1)

        lr = params["lr"]
        num_epochs = params["num_epochs"]

        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(
                learning_rate=lr, step_each_epoch=step, epochs=num_epochs),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        lr = params["lr"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=lr,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))

    return optimizer


def train(args):
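    """Build the train/test programs for the selected model, then run the
    training loop, evaluating on the validation set and saving persistables
    after every pass."""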
    # parameters from arguments
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # model definition
    model = models.__dict__[model_name]()

    # GoogleNet returns three outputs (main head plus two auxiliary heads);
    # the auxiliary losses are weighted by 0.3 and accuracy uses the main head.
    if model_name == "GoogleNet":
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)

        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    # Clone the program for evaluation before optimizer.minimize() appends
    # backward and parameter-update ops, so the test program runs forward only.
    test_program = fluid.default_main_program().clone(for_test=True)

    # parameters from model and arguments
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy

    # initialize optimizer
    optimizer = optimizer_setting(params)
    opts = optimizer.minimize(avg_cost)

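    # Optionally rewrite the main program so that variables reuse memory,
    # reducing peak device memory usage.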
    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)

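    # Unlike resuming from a checkpoint (which restores all persistable
    # variables), a pretrained model is loaded selectively: only variables
    # whose parameter files exist under `pretrained_model` are restored.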
    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = args.batch_size
    test_batch_size = 16
    train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
    test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
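    # reader.train() / reader.val() yield individual (image, label) samples;
    # paddle.batch groups them into minibatches of the sizes above.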
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    # ParallelExecutor runs the program data-parallel across the available
    # devices; fetched metrics come back as arrays, hence the np.mean() calls
    # in the loop below.
    train_exe = fluid.ParallelExecutor(
        use_cuda=args.use_gpu, loss_name=avg_cost.name)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, \
                       acc1 {3}, acc5 {4} time {5}"
                                                   .format(pass_id, \
                       batch_id, loss, acc1, acc5, \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
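        # Evaluate on the validation set; per-batch metrics are weighted by the
        # actual batch size so the final (possibly smaller) batch does not skew
        # the averages.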
        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, \
                       acc1 {3},acc5 {4},time {5}"
                                                  .format(pass_id, \
208
                       test_batch_id, loss, acc1, acc5, \
209 210 211
                       "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(pass_id, \
218 219 220 221
              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
              test_acc5))
        sys.stdout.flush()

        # Persist this pass's parameters and optimizer state.
        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)


def main():
    args = parser.parse_args()
    print_arguments(args)
    train(args)


if __name__ == '__main__':
    main()