train.py 11.5 KB
Newer Older
1 2 3 4
import os
import numpy as np
import time
import sys
5
import paddle
6
import paddle.fluid as fluid
7
import paddle.dataset.flowers as flowers
8
import models
9 10 11
import reader
import argparse
import functools
12
from models.learning_rate import cosine_decay
13
from utility import add_arguments, print_arguments
14
import math
15 16 17

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# Command-line options, one row per option: (name, type, default, help text).
# Options are registered on `parser` in the order the rows appear here.
# yapf: disable
_ARG_TABLE = (
    ('batch_size',       int,   256,                  "Minibatch size."),
    ('use_gpu',          bool,  True,                 "Whether to use GPU or not."),
    ('total_images',     int,   1281167,              "Training image number."),
    ('num_epochs',       int,   120,                  "number of epochs."),
    ('class_dim',        int,   1000,                 "Class number."),
    ('image_shape',      str,   "3,224,224",          "input image size"),
    ('model_save_dir',   str,   "output",             "model save directory"),
    ('with_mem_opt',     bool,  True,                 "Whether to use memory optimization or not."),
    ('pretrained_model', str,   None,                 "Whether to use pretrained model."),
    ('checkpoint',       str,   None,                 "Whether to resume checkpoint."),
    ('lr',               float, 0.1,                  "set learning rate."),
    ('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate decay strategy."),
    ('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use."),
    ('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job."),
)
# yapf: enable

for _arg_row in _ARG_TABLE:
    add_arg(*_arg_row)

# Names accepted for --model: every attribute of the `models` package whose
# name does not contain a double underscore.
model_list = []
for _attr in dir(models):
    if "__" in _attr:
        continue
    model_list.append(_attr)


def _steps_per_epoch(params):
    """Return the number of optimizer steps counted as one epoch.

    Reads ``params["total_images"]`` (falling back to 1281167, the same value
    as the ``total_images`` command-line default, when the key is absent) and
    ``params["learning_strategy"]["batch_size"]``.
    """
    total_images = params.get("total_images", 1281167)
    batch_size = params["learning_strategy"]["batch_size"]
    # "+ 1" so that a trailing partial batch still counts as a step.
    return int(total_images / batch_size + 1)


def optimizer_setting(params):
    """Build the Momentum optimizer for the configured learning-rate strategy.

    Args:
        params: dict-like training configuration. Keys read here:
            ``"learning_strategy"`` (a dict with ``"name"`` and, for the
            decaying strategies, ``"batch_size"``; ``"epochs"`` is also
            required for ``"piecewise_decay"``), ``"lr"``, optionally
            ``"total_images"``, and ``"num_epochs"`` for ``"cosine_decay"``.

    Returns:
        A ``fluid.optimizer.Momentum`` instance with momentum 0.9 and
        ``L2Decay(1e-4)`` regularization. Its learning rate is:

        * ``"piecewise_decay"``: the base rate multiplied by 0.1 at each
          epoch boundary listed in ``learning_strategy["epochs"]``;
        * ``"cosine_decay"``: the schedule produced by ``cosine_decay`` over
          ``num_epochs`` epochs;
        * any other name: the constant base rate ``params["lr"]``.
    """
    ls = params["learning_strategy"]
    strategy = ls["name"]

    if strategy == "piecewise_decay":
        step = _steps_per_epoch(params)
        # Epoch boundaries expressed in optimizer steps.
        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        # One value per interval, i.e. one more value than boundaries.
        values = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        learning_rate = fluid.layers.piecewise_decay(
            boundaries=bd, values=values)
    elif strategy == "cosine_decay":
        step = _steps_per_epoch(params)
        learning_rate = cosine_decay(
            learning_rate=params["lr"],
            step_each_epoch=step,
            epochs=params["num_epochs"])
    else:
        # Unknown strategy names fall back to a constant learning rate.
        learning_rate = params["lr"]

    return fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
84 85


86 87 88 89 90 91 92 93 94
def _build_loss_and_metrics(model, model_name, image, label, class_dim):
    """Add the network, its loss and its accuracy ops to the main program.

    Returns:
        Tuple ``(avg_cost, acc_top1, acc_top5)`` of program variables.
    """
    # Compare by value: the name comes from the command line, so an identity
    # check (`is`) against the literal is not guaranteed to match and would
    # silently skip the three-output handling below.
    if model_name == "GoogleNet":
        # Three outputs: the loss combines all of them (the second and third
        # weighted by 0.3); accuracy is computed on the first output only.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)

        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    return avg_cost, acc_top1, acc_top5


def _print_ce_kpis(gpu_nums, train_loss, train_acc1, train_acc5, test_loss,
                   test_acc1, test_acc5, train_speed):
    """Print the "kpis" lines emitted for continuous evaluation.

    With more than one visible GPU the metric names carry a
    ``_card<gpu_nums>`` suffix.
    """
    if gpu_nums == 1:
        print("kpis\ttrain_cost\t%s" % train_loss)
        print("kpis\ttrain_acc_top1\t%s" % train_acc1)
        print("kpis\ttrain_acc_top5\t%s" % train_acc5)
        print("kpis\ttest_cost\t%s" % test_loss)
        print("kpis\ttest_acc_top1\t%s" % test_acc1)
        print("kpis\ttest_acc_top5\t%s" % test_acc5)
        print("kpis\ttrain_speed\t%s" % train_speed)
    else:
        print("kpis    train_cost_card%s       %s" % (gpu_nums, train_loss))
        print("kpis    train_acc_top1_card%s   %s" % (gpu_nums, train_acc1))
        print("kpis    train_acc_top5_card%s   %s" % (gpu_nums, train_acc5))
        print("kpis    test_cost_card%s        %s" % (gpu_nums, test_loss))
        print("kpis    test_acc_top1_card%s    %s" % (gpu_nums, test_acc1))
        print("kpis    test_acc_top5_card%s    %s" % (gpu_nums, test_acc5))
        print("kpis    train_speed_card%s      %s" % (gpu_nums, train_speed))


def train(args):
    """Build the selected network, then train and evaluate it for every epoch.

    For each pass the function runs all training batches, runs all test
    batches, prints aggregate metrics and saves the persistable variables to
    ``<model_save_dir>/<model>/<pass_id>``. When ``args.enable_ce`` is set,
    the flowers dataset and fixed random seeds are used, and "kpis" lines are
    printed after the final pass.

    Args:
        args: parsed command-line namespace. Attributes read: ``class_dim``,
            ``model``, ``checkpoint``, ``pretrained_model``, ``with_mem_opt``,
            ``model_save_dir``, ``image_shape``, ``enable_ce``,
            ``total_images``, ``lr``, ``num_epochs``, ``batch_size``,
            ``lr_strategy`` and ``use_gpu``.

    Raises:
        AssertionError: if ``args.model`` is not in ``model_list``, or if
            continuous evaluation is requested for a model other than
            ``SE_ResNeXt50_32x4d``.
    """
    # parameters from arguments
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # model definition
    model = models.__dict__[model_name]()

    if args.enable_ce:
        assert model_name == "SE_ResNeXt50_32x4d"

    avg_cost, acc_top1, acc_top5 = _build_loss_and_metrics(
        model, model_name, image, label, class_dim)

    # Cloned before optimizer.minimize() is called, so the evaluation program
    # is taken from the forward/metric-only graph.
    test_program = fluid.default_main_program().clone(for_test=True)

    # parameters from model and arguments
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy
    if args.enable_ce:
        params["dropout_seed"] = 10

    # initialize optimizer
    optimizer = optimizer_setting(params)
    optimizer.minimize(avg_cost)

    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    if args.enable_ce:
        fluid.default_startup_program().random_seed = 1000

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)

    if pretrained_model:

        def if_exist(var):
            # Load only variables that have a file in the pretrained directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = args.batch_size
    test_batch_size = 16

    if not args.enable_ce:
        train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    # Training runs through the ParallelExecutor; evaluation uses `exe`.
    train_exe = fluid.ParallelExecutor(
        use_cuda=bool(args.use_gpu), loss_name=avg_cost.name)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    # Entries in CUDA_VISIBLE_DEVICES; an unset or empty variable counts as 1.
    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))

    for pass_id in range(params["num_epochs"]):
        # Per-batch [loss, top-1 accuracy, top-5 accuracy] for this pass.
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            if batch_id % 10 == 0:
                # NOTE: the backslash continuation is inside the string
                # literal, so the next line's leading whitespace is part of
                # the logged message. Kept as-is to leave log output unchanged.
                print("Pass {0}, trainbatch {1}, loss {2}, \
                       acc1 {3}, acc5 {4} time {5}"
                                                   .format(pass_id, \
                       batch_id, loss, acc1, acc5, \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean wall-clock time per training batch divided by the batch size.
        train_speed = np.array(train_time).mean() / train_batch_size

        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            # Weight each batch by its size so that a smaller final batch
            # does not skew the per-sample averages computed below.
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                # NOTE: same in-string continuation as the training log above;
                # the embedded whitespace is part of the message.
                print("Pass {0},testbatch {1},loss {2}, \
                       acc1 {3},acc5 {4},time {5}"
                                                  .format(pass_id, \
                       test_batch_id, loss, acc1, acc5, \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        # Let os.path.join insert the separators instead of concatenating
        # with '/', which produced "/<model>" for an empty save directory.
        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            _print_ce_kpis(gpu_nums, train_loss, train_acc1, train_acc5,
                           test_loss, test_acc1, test_acc5, train_speed)

286

287
def main():
    """Parse the command line, echo the resulting options and start training."""
    cli_args = parser.parse_args()
    print_arguments(cli_args)
    train(cli_args)
291

292 293 294

# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()