# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import time
import sys
import functools
import math


def set_paddle_flags(flags):
    """Export default values for environment flags.

    Each ``key``/``value`` pair in ``flags`` is written to ``os.environ``
    (with the value converted via ``str``) only when ``key`` is not already
    present, so settings supplied by the caller's environment take priority.
    """
    for name in flags:
        os.environ.setdefault(name, str(flags[name]))


# NOTE(paddle-dev): All of these flags must be set before `import paddle`;
# otherwise they will have no effect. Values already present in the
# environment are left untouched by set_paddle_flags.
set_paddle_flags({
    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc
    'FLAGS_fraction_of_gpu_memory_to_use': 0.98
})
import argparse
import functools
import subprocess
import paddle
import paddle.fluid as fluid
import paddle.dataset.flowers as flowers
import reader_cv2 as reader
import utils
import models
from utils.fp16_utils import create_master_params_grads, master_param_to_train_param
from utils.utility import add_arguments, print_arguments, check_gpu
from utils.learning_rate import cosine_decay_with_warmup
from dist_train import dist_utils

# Number of training images used when `total_images` is not supplied.
IMAGENET1000 = 1281167
# Number of trainer processes; read from the environment, 1 when unset.
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))

parser = argparse.ArgumentParser(description=__doc__)
# Shorthand that registers one command-line option on `parser`.
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
add_arg('batch_size',       int,   256,                  "Minibatch size.")
add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
add_arg('total_images',     int,   IMAGENET1000,         "Training image number.")
add_arg('num_epochs',       int,   120,                  "number of epochs.")
add_arg('class_dim',        int,   1000,                 "Class number.")
add_arg('image_shape',      str,   "3,224,224",          "input image size")
add_arg('model_save_dir',   str,   "output",             "model save directory")
add_arg('with_mem_opt',     bool,  False,                 "Whether to use memory optimization or not.")
add_arg('with_inplace',     bool,  True,                 "Whether to use inplace memory optimization.")
add_arg('pretrained_model', str,   None,                 "Whether to use pretrained model.")
add_arg('checkpoint',       str,   None,                 "Whether to resume checkpoint.")
add_arg('lr',               float, 0.1,                  "set learning rate.")
add_arg('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate decay strategy.")
add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job.")
add_arg('data_dir',         str,   "./data/ILSVRC2012/",  "The ImageNet dataset root dir.")
add_arg('fp16',             bool,  False,                "Enable half precision training with fp16." )
add_arg('scale_loss',       float, 1.0,                  "Scale loss for fp16." )
add_arg('l2_decay',         float, 1e-4,                 "L2_decay parameter.")
add_arg('momentum_rate',    float, 0.9,                  "momentum_rate.")
add_arg('use_label_smoothing',      bool,      False,        "Whether to use label_smoothing or not")
add_arg('label_smoothing_epsilon',      float,     0.2,      "Set the label_smoothing_epsilon parameter")
add_arg('lower_scale',      float,     0.08,      "Set the lower_scale in ramdom_crop")
add_arg('lower_ratio',      float,     3./4.,      "Set the lower_ratio in ramdom_crop")
add_arg('upper_ratio',      float,     4./3.,      "Set the upper_ratio in ramdom_crop")
add_arg('resize_short_size',      int,     256,      "Set the resize_short_size")
add_arg('use_mixup',      bool,      False,        "Whether to use mixup or not")
add_arg('mixup_alpha',      float,     0.2,      "Set the mixup_alpha parameter")
add_arg('is_distill',       bool,  False,        "is distill or not")


def _total_images(params):
    """Return params["total_images"], or IMAGENET1000 when the key is absent."""
    if "total_images" not in params:
        return IMAGENET1000
    return params["total_images"]


def _steps_per_epoch(params, batch_size):
    """Return the number of batches in one epoch, rounded up."""
    return int(math.ceil(float(_total_images(params)) / batch_size))


def optimizer_setting(params):
    """Build the optimizer selected by params["learning_strategy"]["name"].

    Args:
        params: mapping with the keys "learning_strategy" (itself a mapping
            with "name", plus "batch_size" and, for piecewise decay,
            "epochs"), "lr", "l2_decay" and "momentum_rate". "num_epochs" is
            read by the cosine, warmup and linear strategies; "total_images"
            is optional and falls back to IMAGENET1000.

    Returns:
        A fluid optimizer:
          * "adam"            -> Adam with a constant learning rate and no
                                 regularizer;
          * "rmsprop_cosine"  -> RMSProp (epsilon=1) with cosine decay and
                                 L2 decay;
          * anything else     -> Momentum with L2 decay, whose learning rate
                                 is a piecewise / cosine / cosine-with-warmup
                                 / linear schedule, or the constant
                                 params["lr"] for unrecognized names.
    """
    ls = params["learning_strategy"]
    l2_decay = params["l2_decay"]
    momentum_rate = params["momentum_rate"]
    name = ls["name"]

    if name == "adam":
        return fluid.optimizer.Adam(learning_rate=params["lr"])

    # Build the learning-rate schedule first; the regularizer and optimizer
    # are created afterwards, in the same order as before the refactoring.
    if name == "piecewise_decay":
        step = _steps_per_epoch(params, ls["batch_size"])
        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        # One value per interval: the rate drops by 10x at every boundary.
        values = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        learning_rate = fluid.layers.piecewise_decay(
            boundaries=bd, values=values)
    elif name in ("cosine_decay", "rmsprop_cosine"):
        step = _steps_per_epoch(params, ls["batch_size"])
        learning_rate = fluid.layers.cosine_decay(
            learning_rate=params["lr"],
            step_each_epoch=step,
            epochs=params["num_epochs"])
    elif name == "cosine_warmup_decay":
        step = _steps_per_epoch(params, ls["batch_size"])
        learning_rate = cosine_decay_with_warmup(
            learning_rate=params["lr"],
            step_each_epoch=step,
            epochs=params["num_epochs"])
    elif name == "linear_decay":
        num_epochs = params["num_epochs"]
        start_lr = params["lr"]
        end_lr = 0
        # Unlike the other schedules, the step count here is truncated after
        # multiplying by the epoch count rather than rounded up per epoch.
        total_step = int(
            (float(_total_images(params)) / ls["batch_size"]) * num_epochs)
        learning_rate = fluid.layers.polynomial_decay(
            start_lr, total_step, end_lr, power=1)
    else:
        # Unknown strategy name: constant learning rate.
        learning_rate = params["lr"]

    regularization = fluid.regularizer.L2Decay(l2_decay)
    if name == "rmsprop_cosine":
        return fluid.optimizer.RMSProp(
            learning_rate=learning_rate,
            momentum=momentum_rate,
            regularization=regularization,
            # RMSProp Optimizer: Apply epsilon=1 on ImageNet.
            epsilon=1)

    return fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=momentum_rate,
        regularization=regularization)


def calc_loss(epsilon, label, class_dim, softmax_out, use_label_smoothing):
    """Return the cross-entropy loss of ``softmax_out`` against ``label``.

    When ``use_label_smoothing`` is true, ``label`` is one-hot encoded to
    ``class_dim`` classes, smoothed with ``epsilon`` and used as a soft
    label; otherwise ``label`` is passed to cross_entropy directly.
    """
    if not use_label_smoothing:
        return fluid.layers.cross_entropy(input=softmax_out, label=label)

    one_hot = fluid.layers.one_hot(input=label, depth=class_dim)
    smoothed = fluid.layers.label_smooth(
        label=one_hot, epsilon=epsilon, dtype="float32")
    return fluid.layers.cross_entropy(
        input=softmax_out, label=smoothed, soft_label=True)


def net_config(image, model, args, is_train, label=0, y_a=0, y_b=0, lam=0.0):
    """Add the forward network, loss and accuracy ops for ``model``.

    Args:
        image: input image variable fed to ``model.net``.
        model: model object exposing ``net(input=..., class_dim=...)`` and a
            ``params`` mapping.
        args: parsed command-line arguments (model, class_dim, use_mixup,
            use_label_smoothing, label_smoothing_epsilon, enable_ce,
            is_distill, scale_loss are read here).
        is_train: whether the training loss (mixup / label smoothing) is
            built; otherwise plain cross entropy is used.
        label: label variable for the non-mixup paths.
        y_a, y_b, lam: the two label variables and the mixing coefficient
            used by the mixup training path.

    Returns:
        ``avg_cost`` alone on the mixup training path (non-GoogleNet,
        non-distill, ``is_train`` and ``args.use_mixup`` both true);
        otherwise the tuple ``(avg_cost, acc_top1, acc_top5)``.

    Raises:
        AssertionError: if ``args.model`` is not exported by ``models``, or
            if ``args.enable_ce`` is set with a model other than
            "SE_ResNeXt50_32x4d".
    """
    model_list = [m for m in dir(models) if "__" not in m]
    assert args.model in model_list, "{} is not lists: {}".format(args.model,
                                                                  model_list)
    class_dim = args.class_dim
    model_name = args.model
    use_mixup = args.use_mixup
    use_label_smoothing = args.use_label_smoothing
    epsilon = args.label_smoothing_epsilon

    if args.enable_ce:
        # Continuous evaluation uses a fixed dropout seed and 102 classes
        # (train() feeds the flowers dataset in this mode).
        assert model_name == "SE_ResNeXt50_32x4d"
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name == "GoogleNet":
        # GoogleNet returns three outputs; the two extra ones are weighted
        # by 0.3 in the loss, and accuracy is measured on the first only.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)

    else:
        if not args.is_distill:
            out = model.net(input=image, class_dim=class_dim)
            softmax_out = fluid.layers.softmax(out, use_cudnn=False)
            if is_train:
                if use_mixup:
                    # Mixup: blend the losses against both label sets.
                    loss_a = calc_loss(epsilon, y_a, class_dim, softmax_out,
                                       use_label_smoothing)
                    loss_b = calc_loss(epsilon, y_b, class_dim, softmax_out,
                                       use_label_smoothing)
                    loss_a_mean = fluid.layers.mean(x=loss_a)
                    loss_b_mean = fluid.layers.mean(x=loss_b)
                    cost = lam * loss_a_mean + (1 - lam) * loss_b_mean
                    avg_cost = fluid.layers.mean(x=cost)
                    if args.scale_loss > 1:
                        avg_cost = fluid.layers.mean(x=cost) * float(
                            args.scale_loss)
                    # No accuracy ops on this path: only the loss is returned.
                    return avg_cost
                else:
                    cost = calc_loss(epsilon, label, class_dim, softmax_out,
                                     use_label_smoothing)

            else:
                cost = fluid.layers.cross_entropy(
                    input=softmax_out, label=label)
        else:
            # Distillation: the first output's softmax is the soft target
            # for the second output.
            # NOTE(review): this path passes args.class_dim, so the
            # enable_ce override of class_dim above is not applied here.
            out1, out2 = model.net(input=image, class_dim=args.class_dim)
            softmax_out1, softmax_out = fluid.layers.softmax(
                out1), fluid.layers.softmax(out2)
            smooth_out1 = fluid.layers.label_smooth(
                label=softmax_out1, epsilon=0.0, dtype="float32")
            cost = fluid.layers.cross_entropy(
                input=softmax_out, label=smooth_out1, soft_label=True)

        avg_cost = fluid.layers.mean(cost)
        if args.scale_loss > 1:
            avg_cost = fluid.layers.mean(x=cost) * float(args.scale_loss)
        acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)

    return avg_cost, acc_top1, acc_top5

def build_program(is_train, main_prog, startup_prog, args):
    """Populate ``main_prog`` / ``startup_prog`` with the reader and network.

    Args:
        is_train: build the training graph (including optimizer ops) when
            true, the evaluation graph otherwise.
        main_prog: fluid program that receives the network ops.
        startup_prog: fluid program that receives the initialization ops.
        args: parsed command-line arguments.

    Returns:
        A list whose layout depends on the mode:
          * mixup training:  [py_reader, avg_cost, global_lr]
          * other training:  [py_reader, avg_cost, acc_top1, acc_top5,
                              global_lr]
          * evaluation:      [py_reader, avg_cost, acc_top1, acc_top5]

    Raises:
        AssertionError: if ``args.model`` is not exported by ``models``.
    """
    image_shape = [int(m) for m in args.image_shape.split(",")]
    model_name = args.model
    model_list = [m for m in dir(models) if "__" not in m]
    assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    model = models.__dict__[model_name]()
    with fluid.program_guard(main_prog, startup_prog):
        use_mixup = args.use_mixup
        mixup_train = is_train and use_mixup
        if mixup_train:
            # Four slots: image, the two label sets, and the mixing weight.
            shapes = [[-1] + image_shape, [-1, 1], [-1, 1], [-1, 1]]
            lod_levels = [0, 0, 0, 0]
            dtypes = ["float32", "int64", "int64", "float32"]
        else:
            # Two slots: image and label.
            shapes = [[-1] + image_shape, [-1, 1]]
            lod_levels = [0, 0]
            dtypes = ["float32", "int64"]
        py_reader = fluid.layers.py_reader(
            capacity=16,
            shapes=shapes,
            lod_levels=lod_levels,
            dtypes=dtypes,
            use_double_buffer=True)

        with fluid.unique_name.guard():
            if mixup_train:
                image, y_a, y_b, lam = fluid.layers.read_file(py_reader)
                if args.fp16:
                    image = fluid.layers.cast(image, "float16")
                # On this path net_config returns the loss only.
                avg_cost = net_config(
                    image=image,
                    y_a=y_a,
                    y_b=y_b,
                    lam=lam,
                    model=model,
                    args=args,
                    label=0,
                    is_train=True)
                avg_cost.persistable = True
                build_program_out = [py_reader, avg_cost]
            else:
                image, label = fluid.layers.read_file(py_reader)
                if args.fp16:
                    image = fluid.layers.cast(image, "float16")
                avg_cost, acc_top1, acc_top5 = net_config(
                    image, model, args, label=label, is_train=is_train)
                avg_cost.persistable = True
                acc_top1.persistable = True
                acc_top5.persistable = True
                build_program_out = [py_reader, avg_cost, acc_top1, acc_top5]

            if is_train:
                # Copy the command-line hyper-parameters into the model's
                # params mapping, which optimizer_setting reads.
                params = model.params
                params["total_images"] = args.total_images
                params["lr"] = args.lr
                params["num_epochs"] = args.num_epochs
                params["learning_strategy"]["batch_size"] = args.batch_size
                params["learning_strategy"]["name"] = args.lr_strategy
                params["l2_decay"] = args.l2_decay
                params["momentum_rate"] = args.momentum_rate

                optimizer = optimizer_setting(params)
                if args.fp16:
                    # fp16: compute gradients, apply them to the master
                    # parameters produced by create_master_params_grads,
                    # then copy the result back to the training parameters.
                    params_grads = optimizer.backward(avg_cost)
                    master_params_grads = create_master_params_grads(
                        params_grads, main_prog, startup_prog, args.scale_loss)
                    optimizer.apply_gradients(master_params_grads)
                    master_param_to_train_param(master_params_grads,
                                                params_grads, main_prog)
                else:
                    optimizer.minimize(avg_cost)
                global_lr = optimizer._global_learning_rate()
                global_lr.persistable = True
                build_program_out.append(global_lr)

    return build_program_out


def get_device_num():
    """Return the number of GPU devices this process should use.

    Returns 1 when more than one trainer process is configured. Otherwise
    the count is the number of comma-separated entries in
    ``CUDA_VISIBLE_DEVICES`` when that variable is set and non-empty, or the
    number of newline characters in the output of ``nvidia-smi -L``.

    Raises:
        OSError / subprocess.CalledProcessError: propagated from
            ``subprocess.check_output`` when ``nvidia-smi`` cannot be run or
            exits with a non-zero status.
    """
    # NOTE(zcd): for multi-process training, each process uses one GPU card.
    if num_trainers > 1:
        return 1
    visible_device = os.environ.get('CUDA_VISIBLE_DEVICES', None)
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(
            ['nvidia-smi', '-L']).decode().count('\n')
    return device_num

def train(args):
    """Build the train/test programs and run the training loop.

    For each epoch this runs the training reader to exhaustion, evaluates
    on the test reader, prints per-batch and per-epoch statistics, and
    saves the persistable variables under
    ``<model_save_dir>/<model>/<epoch index>``.

    Args:
        args: parsed command-line arguments (see the ``add_arg`` calls at
            module level).
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    use_mixup = args.use_mixup

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    b_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    if use_mixup:
        # The mixup training graph has no accuracy outputs.
        train_py_reader, train_cost, global_lr = b_out[0], b_out[1], b_out[2]
        train_fetch_vars = [train_cost, global_lr]
    else:
        train_py_reader, train_cost, train_acc1_var, train_acc5_var, global_lr = (
            b_out[0], b_out[1], b_out[2], b_out[3], b_out[4])
        train_fetch_vars = [
            train_cost, train_acc1_var, train_acc5_var, global_lr
        ]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    b_out_test = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1_var, test_acc5_var = (
        b_out_test[0], b_out_test[1], b_out_test[2], b_out_test[3])
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            # Load only the variables that have a file in the model dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # Integer division: this file enables true division via __future__, and
    # a batch size must be a whole number of samples.
    train_batch_size = args.batch_size // device_num

    test_batch_size = 16
    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(
            settings=args,
            batch_size=train_batch_size,
            shuffle_seed=shuffle_seed)
        test_reader = reader.val(settings=args, batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        if num_trainers > 1:
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    test_fetch_vars = [test_cost, test_acc1_var, test_acc5_var]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    # use_ngraph is for CPU only, please refer to README_ngraph.md for details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        # memopt may affect GC results
        #build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        #build_strategy.fuse_all_reduce_ops=1

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10
        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    # Iterate over the epoch count given on the command line; the CE check
    # at the end of the loop body compares against the same value.
    for pass_id in range(args.num_epochs):

        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        time_record = []
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    fetched = train_exe.run(train_prog,
                                            fetch_list=train_fetch_list)
                else:
                    fetched = train_exe.run(fetch_list=train_fetch_list)

                if use_mixup:
                    loss, lr = fetched
                else:
                    loss, acc1, acc5, lr = fetched
                    acc1 = np.mean(np.array(acc1))
                    acc5 = np.mean(np.array(acc5))
                    train_info[1].append(acc1)
                    train_info[2].append(acc5)

                t2 = time.time()
                period = t2 - t1
                time_record.append(period)

                loss = np.mean(np.array(loss))
                train_info[0].append(loss)
                lr = np.mean(np.array(lr))
                train_time.append(period)

                if batch_id % 10 == 0:
                    # Report the mean batch time since the previous report.
                    period = np.mean(time_record)
                    time_record = []
                    if use_mixup:
                        print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}"
                              .format(pass_id, batch_id, "%.5f"%loss, "%.5f" %lr, "%2.2f sec" % period))
                    else:
                        # NOTE: the backslash continuation keeps the next
                        # line's leading spaces inside the message; the
                        # log format is left unchanged on purpose.
                        print("Pass {0}, trainbatch {1}, loss {2}, \
                            acc1 {3}, acc5 {4}, lr {5}, time {6}"
                              .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" %
                                      lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            # Raised when the reader is exhausted: end of the epoch.
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        if not use_mixup:
            train_acc1 = np.array(train_info[1]).mean()
            train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()

        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        if use_mixup:
            print("End pass {0}, train_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%test_loss, "%.5f"%test_acc1, "%.5f"%test_acc5))
        else:
            print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
                  "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                      pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss,
                      "%.5f"%test_acc1, "%.5f"%test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            # KPI names carry a "_card<N>" suffix when several devices are used.
            card_suffix = "" if device_num == 1 else "_card%s" % device_num
            # Use the mean cost/acc for training
            print("kpis\ttrain_cost%s\t%s" % (card_suffix, train_loss))
            if not use_mixup:
                # Training accuracy is not computed on the mixup path.
                print("kpis\ttrain_acc_top1%s\t%s" % (card_suffix, train_acc1))
                print("kpis\ttrain_acc_top5%s\t%s" % (card_suffix, train_acc5))
            # Use the mean cost/acc for testing
            print("kpis\ttest_cost%s\t%s" % (card_suffix, test_loss))
            print("kpis\ttest_acc_top1%s\t%s" % (card_suffix, test_acc1))
            print("kpis\ttest_acc_top5%s\t%s" % (card_suffix, test_acc5))
            print("kpis\ttrain_speed%s\t%s" % (card_suffix, train_speed))


def main():
    """Parse the command line, print the arguments, check GPU use and train."""
    args = parser.parse_args()
    print_arguments(args)
    check_gpu(args.use_gpu)
    train(args)


if __name__ == '__main__':
    main()