train.py 18.6 KB
Newer Older
R
root 已提交
1 2 3
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
4 5 6 7
import os
import numpy as np
import time
import sys
R
root 已提交
8 9
import functools
import math
10
import paddle
11
import paddle.fluid as fluid
12
import paddle.dataset.flowers as flowers
13
import reader as reader
14
import argparse
R
ruri 已提交
15 16 17
import functools
import subprocess
import utils
18
import models
T
typhoonzero 已提交
19
from utils.fp16_utils import create_master_params_grads, master_param_to_train_param
20 21
from utils.utility import add_arguments, print_arguments
from utils.learning_rate import cosine_decay_with_warmup
R
root 已提交
22 23

# Default number of training images; optimizer_setting() falls back to this
# value when the params dict carries no "total_images" entry.
IMAGENET1000 = 1281167

# Command-line interface. `add_arg(name, type, default, help)` registers one
# flag on `parser` through utils.utility.add_arguments.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',       int,   256,                  "Minibatch size.")
add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
# Reuse the module constant instead of repeating the literal.
add_arg('total_images',     int,   IMAGENET1000,         "Training image number.")
add_arg('num_epochs',       int,   120,                  "number of epochs.")
add_arg('class_dim',        int,   1000,                 "Class number.")
add_arg('image_shape',      str,   "3,224,224",          "input image size")
add_arg('model_save_dir',   str,   "output",             "model save directory")
add_arg('with_mem_opt',     bool,  True,                 "Whether to use memory optimization or not.")
add_arg('pretrained_model', str,   None,                 "Whether to use pretrained model.")
add_arg('checkpoint',       str,   None,                 "Whether to resume checkpoint.")
add_arg('lr',               float, 0.1,                  "set learning rate.")
add_arg('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate decay strategy.")
add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job.")
add_arg('data_dir',         str,   "./data/ILSVRC2012",  "The ImageNet dataset root dir.")
add_arg('model_category',   str,   "models_name",        "Whether to use models_name or not, valid value:'models','models_name'." )
add_arg('fp16',             bool,  False,                "Enable half precision training with fp16." )
add_arg('scale_loss',       float, 1.0,                  "Scale loss for fp16." )
add_arg('l2_decay',         float, 1e-4,                 "L2_decay parameter.")
add_arg('momentum_rate',    float, 0.9,                  "momentum_rate.")
48 49 50

def optimizer_setting(params):
    """Build the optimizer selected by ``params["learning_strategy"]["name"]``.

    Args:
        params: dict of training settings. ``learning_strategy`` (a dict with
            at least ``name``), ``l2_decay`` and ``momentum_rate`` are always
            read. Depending on the strategy, ``lr``, ``num_epochs``,
            ``total_images`` (falls back to ``IMAGENET1000`` when absent) and
            the strategy's ``batch_size`` / ``epochs`` entries are read too.

    Returns:
        A fluid optimizer:
          * ``piecewise_decay``      - Momentum, LR divided by 10 at each
            boundary derived from ``learning_strategy["epochs"]``.
          * ``cosine_decay``         - Momentum with ``fluid.layers.cosine_decay``.
          * ``cosine_warmup_decay``  - Momentum with ``cosine_decay_with_warmup``.
          * ``linear_decay``         - Momentum with a power-1 polynomial decay
            from ``lr`` down to 0 over all training steps.
          * ``adam``                 - Adam with a constant LR (no L2 decay).
          * ``rmsprop_cosine``       - RMSProp with cosine decay, epsilon=1.
          * anything else            - Momentum with a constant LR.

    Raises:
        KeyError: if an entry required by the chosen strategy is missing.
    """
    ls = params["learning_strategy"]
    strategy = ls["name"]
    l2_decay = params["l2_decay"]
    momentum_rate = params["momentum_rate"]

    def total_images():
        # Same fallback the original applied in every branch.
        return params.get("total_images", IMAGENET1000)

    def steps_per_epoch():
        # A partial final batch still counts as one step, hence ceil.
        return int(math.ceil(float(total_images()) / ls["batch_size"]))

    def make_momentum(learning_rate):
        return fluid.optimizer.Momentum(
            learning_rate=learning_rate,
            momentum=momentum_rate,
            regularization=fluid.regularizer.L2Decay(l2_decay))

    def cosine_schedule():
        return fluid.layers.cosine_decay(
            learning_rate=params["lr"],
            step_each_epoch=steps_per_epoch(),
            epochs=params["num_epochs"])

    if strategy == "piecewise_decay":
        step = steps_per_epoch()
        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        # One more value than boundaries: the LR before the first boundary.
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        return make_momentum(
            fluid.layers.piecewise_decay(boundaries=bd, values=lr))

    if strategy == "cosine_decay":
        return make_momentum(cosine_schedule())

    if strategy == "cosine_warmup_decay":
        return make_momentum(
            cosine_decay_with_warmup(
                learning_rate=params["lr"],
                step_each_epoch=steps_per_epoch(),
                epochs=params["num_epochs"]))

    if strategy == "linear_decay":
        start_lr = params["lr"]
        end_lr = 0
        # Deliberately not ceil'ed per epoch: the original truncates the
        # fractional total, so that behaviour is kept.
        total_step = int(
            (float(total_images()) / ls["batch_size"]) * params["num_epochs"])
        return make_momentum(
            fluid.layers.polynomial_decay(
                start_lr, total_step, end_lr, power=1))

    if strategy == "adam":
        return fluid.optimizer.Adam(learning_rate=params["lr"])

    if strategy == "rmsprop_cosine":
        return fluid.optimizer.RMSProp(
            learning_rate=cosine_schedule(),
            momentum=momentum_rate,
            regularization=fluid.regularizer.L2Decay(l2_decay),
            # RMSProp Optimizer: Apply epsilon=1 on ImageNet.
            epsilon=1)

    # Unknown strategy name: plain Momentum with a constant learning rate.
    return make_momentum(params["lr"])
156

R
ruri 已提交
157 158
def net_config(image, label, model, args):
    """Attach the forward pass, loss and accuracy ops for ``model``.

    Args:
        image: input image variable fed to ``model.net``.
        label: ground-truth label variable.
        model: model object exposing ``net(input=..., class_dim=...)`` and a
            ``params`` dict.
        args: parsed arguments; ``model``, ``class_dim``, ``enable_ce`` and
            ``scale_loss`` are read.

    Returns:
        Tuple ``(avg_cost, acc_top1, acc_top5)`` of fluid variables.
    """
    available = [name for name in dir(models) if "__" not in name]
    assert args.model in available, "{} is not lists: {}".format(args.model,
                                                                 available)

    model_name = args.model
    class_dim = args.class_dim

    if args.enable_ce:
        # Continuous-evaluation runs pin the model, the dropout seed and the
        # class count (train() feeds the flowers dataset in that mode).
        assert model_name == "SE_ResNeXt50_32x4d"
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name != "GoogleNet":
        logits = model.net(input=image, class_dim=class_dim)
        cost, pred = fluid.layers.softmax_with_cross_entropy(
            logits, label, return_softmax=True)
        avg_cost = fluid.layers.mean(x=cost)
        if args.scale_loss > 1:
            # Loss scaling is only applied for factors above 1.
            avg_cost = avg_cost * float(args.scale_loss)
        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
        return avg_cost, acc_top1, acc_top5

    # GoogleNet yields a main output plus two auxiliary outputs; the
    # auxiliary losses are each weighted by 0.3.
    heads = model.net(input=image, class_dim=class_dim)
    main_out, aux_out1, aux_out2 = heads
    losses = [
        fluid.layers.cross_entropy(input=head, label=label)
        for head in (main_out, aux_out1, aux_out2)
    ]
    mean_main, mean_aux1, mean_aux2 = [
        fluid.layers.mean(x=loss) for loss in losses
    ]

    avg_cost = mean_main + 0.3 * mean_aux1 + 0.3 * mean_aux2
    acc_top1 = fluid.layers.accuracy(input=main_out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=main_out, label=label, k=5)
    return avg_cost, acc_top1, acc_top5


def build_program(is_train, main_prog, startup_prog, args):
    """Populate ``main_prog`` with the network and, when training, the optimizer.

    Args:
        is_train: when True, optimizer ops are added and the learning-rate
            variable is returned as well.
        main_prog: fluid Program that receives the network ops.
        startup_prog: fluid Program that receives initialisation ops.
        args: parsed command-line arguments.

    Returns:
        ``(py_reader, avg_cost, acc_top1, acc_top5, global_lr)`` when
        ``is_train`` is True, otherwise
        ``(py_reader, avg_cost, acc_top1, acc_top5)``.
    """
    input_shape = [int(dim) for dim in args.image_shape.split(",")]
    model_name = args.model
    known_models = [name for name in dir(models) if "__" not in name]
    assert model_name in known_models, "{} is not in lists: {}".format(
        args.model, known_models)
    model = models.__dict__[model_name]()

    with fluid.program_guard(main_prog, startup_prog):
        # Reader queue: float32 images of the requested shape, int64 labels.
        py_reader = fluid.layers.py_reader(
            capacity=16,
            shapes=[[-1] + input_shape, [-1, 1]],
            lod_levels=[0, 0],
            dtypes=["float32", "int64"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, label = fluid.layers.read_file(py_reader)
            if args.fp16:
                image = fluid.layers.cast(image, "float16")

            avg_cost, acc_top1, acc_top5 = net_config(image, label, model, args)
            for fetched in (avg_cost, acc_top1, acc_top5):
                fetched.persistable = True

            if is_train:
                # Command-line values override the model's own settings.
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                strategy = params["learning_strategy"]
                strategy["batch_size"] = args.batch_size
                strategy["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate

                optimizer = optimizer_setting(params)
                if not args.fp16:
                    optimizer.minimize(avg_cost)
                else:
                    # fp16 path: gradients are applied to master parameters,
                    # which are then copied back to the training parameters.
                    params_grads = optimizer.backward(avg_cost)
                    master_params_grads = create_master_params_grads(
                        params_grads, main_prog, startup_prog, args.scale_loss)
                    optimizer.apply_gradients(master_params_grads)
                    master_param_to_train_param(master_params_grads,
                                                params_grads, main_prog)
                global_lr = optimizer._global_learning_rate()

    outputs = (py_reader, avg_cost, acc_top1, acc_top5)
    if is_train:
        outputs += (global_lr, )
    return outputs
R
ruri 已提交
245

246 247 248 249 250 251 252
def get_device_num():
    """Return the number of GPUs this process should use.

    The count is taken from ``CUDA_VISIBLE_DEVICES`` when that variable names
    at least one device. Empty entries (for example from a trailing comma)
    are ignored. Otherwise the devices listed by ``nvidia-smi -L`` are
    counted, one per non-empty output line.

    Returns:
        int: number of devices.

    Raises:
        OSError, subprocess.CalledProcessError: if ``nvidia-smi`` has to be
            consulted and cannot be run successfully.
    """
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_ids = [d for d in visible_device.split(',') if d.strip()]
        if device_ids:
            return len(device_ids)
    listing = subprocess.check_output(['nvidia-smi', '-L']).decode()
    # Count non-empty lines so a missing trailing newline does not drop a GPU.
    return len([line for line in listing.splitlines() if line.strip()])
R
ruri 已提交
253 254 255 256 257 258 259 260

def train(args):
    """Train the selected model and evaluate it after every pass.

    Builds separate train and test programs that share one startup program,
    optionally restores a checkpoint and/or pretrained weights, then for each
    of ``args.num_epochs`` passes: runs the training reader to exhaustion,
    runs the test reader to exhaustion, prints the pass averages and saves
    the persistable variables under ``<model_save_dir>/<model>/<pass_id>``.
    With ``args.enable_ce`` set, "kpis" lines are printed after the last pass.

    Args:
        args: parsed command-line arguments (see the ``add_arg`` calls).

    Raises:
        ValueError: if ``args.batch_size`` is smaller than the device count,
            which would leave every device with an empty batch.
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    # The *_var names hold graph variables; the per-pass numpy averages
    # computed below use different names so they cannot shadow them.
    (train_py_reader, train_cost_var, train_acc1_var, train_acc5_var,
     global_lr) = build_program(
         is_train=True,
         main_prog=train_prog,
         startup_prog=startup_prog,
         args=args)
    (test_py_reader, test_cost_var, test_acc1_var,
     test_acc5_var) = build_program(
         is_train=False,
         main_prog=test_prog,
         startup_prog=startup_prog,
         args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    device_num = get_device_num() if args.use_gpu else 1
    # Floor division: the reader needs an integral per-device batch size, and
    # `/` yields a float under `from __future__ import division`.
    train_batch_size = args.batch_size // device_num
    if train_batch_size < 1:
        raise ValueError(
            "batch_size ({}) must be at least the number of devices ({})".
            format(args.batch_size, device_num))

    test_batch_size = 16
    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(), batch_size=train_batch_size, drop_last=True)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    # With FLAGS_use_ngraph set, the plain executor is used for training
    # instead of a ParallelExecutor.
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost_var.name)
    else:
        train_exe = exe

    train_fetch_list = [
        train_cost_var.name, train_acc1_var.name, train_acc5_var.name,
        global_lr.name
    ]
    test_fetch_list = [
        test_cost_var.name, test_acc1_var.name, test_acc5_var.name
    ]

    # Loop over the requested epoch count directly; the last-pass CE check
    # below uses the same value.
    for pass_id in range(args.num_epochs):

        train_py_reader.start()

        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            # The reader signals the end of the pass with EOFException.
            while True:
                t1 = time.time()

                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)

                if batch_id % 10 == 0:
                    # NOTE: the backslash continuation keeps the next line's
                    # indentation inside the message; left as in the original
                    # so the emitted log text does not change.
                    print("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc5 {4}, lr {5}, time {6}"
                          .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" %
                                  lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean batch time divided by the number of images per step.
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss,
                  "%.5f"%test_acc1, "%.5f"%test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            # Multi-device runs tag every KPI name with "_card<N>".
            card = "" if device_num == 1 else "_card%s" % device_num
            # Use the mean cost/acc for training
            print("kpis\ttrain_cost%s\t%s" % (card, train_loss))
            print("kpis\ttrain_acc_top1%s\t%s" % (card, train_acc1))
            print("kpis\ttrain_acc_top5%s\t%s" % (card, train_acc5))
            # Use the mean cost/acc for testing
            print("kpis\ttest_cost%s\t%s" % (card, test_loss))
            print("kpis\ttest_acc_top1%s\t%s" % (card, test_acc1))
            print("kpis\ttest_acc_top5%s\t%s" % (card, test_acc5))
            print("kpis\ttrain_speed%s\t%s" % (card, train_speed))
453

454

455
def main():
    """Parse the command-line flags, echo them, then start training."""
    cli_args = parser.parse_args()
    print_arguments(cli_args)
    train(cli_args)
459

460 461 462

# Start training only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()