train.py 15.2 KB
Newer Older
1 2 3 4
import os
import numpy as np
import time
import sys
5
import paddle
6 7
import paddle.fluid as fluid
from se_resnext import SE_ResNeXt
8
from mobilenet import mobile_net
Y
Yibing Liu 已提交
9
from inception_v4 import inception_v4
10 11 12
import reader
import argparse
import functools
13
import paddle.fluid.layers.ops as ops
14
from utility import add_arguments, print_arguments
15 16 17
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
import math
18 19 20

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
add_arg('batch_size', int, 256, "Minibatch size.")
add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.")
add_arg('with_mem_opt', bool, True,
        "Whether to use memory optimization or not.")
add_arg('parallel_exe', bool, True,
        "Whether to use ParallelExecutor to train or not.")
add_arg('init_model', str, None, "Whether to use initialized model.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('lr_strategy', str, "cosine_decay",
        "Set the learning rate decay strategy.")
add_arg('model', str, "se_resnext", "Set the network to use.")


def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        decayed_lr = learning_rate * \
                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
    return decayed_lr
D
Dang Qingqing 已提交
45 46 47 48 49 50 51


def train_parallel_do(args,
                      learning_rate,
                      batch_size,
                      num_passes,
                      init_model=None,
52
                      pretrained_model=None,
D
Dang Qingqing 已提交
53 54 55 56 57
                      model_save_dir='model',
                      parallel=True,
                      use_nccl=True,
                      lr_strategy=None,
                      layers=50):
58 59 60 61 62 63 64 65 66 67 68 69 70
    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    if parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)

        with pd.do():
            image_ = pd.read_input(image)
            label_ = pd.read_input(label)
71 72 73
            if args.model is 'se_resnext':
                out = SE_ResNeXt(
                    input=image_, class_dim=class_dim, layers=layers)
Y
Yibing Liu 已提交
74
            elif args.model is 'mobile_net':
75
                out = mobile_net(img=image_, class_dim=class_dim)
Y
Yibing Liu 已提交
76 77
            else:
                out = inception_v4(img=image_, class_dim=class_dim)
78

79 80 81 82 83 84 85 86 87 88 89 90 91
            cost = fluid.layers.cross_entropy(input=out, label=label_)
            avg_cost = fluid.layers.mean(x=cost)
            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
            pd.write_output(avg_cost)
            pd.write_output(acc_top1)
            pd.write_output(acc_top5)

        avg_cost, acc_top1, acc_top5 = pd()
        avg_cost = fluid.layers.mean(x=avg_cost)
        acc_top1 = fluid.layers.mean(x=acc_top1)
        acc_top5 = fluid.layers.mean(x=acc_top5)
    else:
92 93
        if args.model is 'se_resnext':
            out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
Y
Yibing Liu 已提交
94
        elif args.model is 'mobile_net':
95
            out = mobile_net(img=image, class_dim=class_dim)
Y
Yibing Liu 已提交
96 97
        else:
            out = inception_v4(img=image, class_dim=class_dim)
98

99 100 101 102 103
        cost = fluid.layers.cross_entropy(input=out, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

D
Dang Qingqing 已提交
104 105
    inference_program = fluid.default_main_program().clone(for_test=True)

106 107 108
    if "piecewise_decay" in lr_strategy:
        bd = lr_strategy["piecewise_decay"]["bd"]
        lr = lr_strategy["piecewise_decay"]["lr"]
109
        optimizer = fluid.optimizer.Momentum(
110 111 112 113 114 115 116 117 118 119 120 121
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif "cosine_decay" in lr_strategy:
        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
        epochs = lr_strategy["cosine_decay"]["epochs"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(
                learning_rate=learning_rate,
                step_each_epoch=step_each_epoch,
                epochs=epochs),
122 123 124 125
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        optimizer = fluid.optimizer.Momentum(
126
            learning_rate=learning_rate,
127 128 129 130 131 132 133 134 135 136 137 138 139 140
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))

    opts = optimizer.minimize(avg_cost)
    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if init_model is not None:
        fluid.io.load_persistables(exe, init_model)

141 142 143 144 145 146 147
    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206
    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    for pass_id in range(num_passes):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[avg_cost, acc_top1, acc_top5])
            t2 = time.time()
            period = t2 - t1
            train_info[0].append(loss[0])
            train_info[1].append(acc1[0])
            train_info[2].append(acc5[0])
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, \
                       acc1 {3}, acc5 {4} time {5}"
                                                   .format(pass_id, \
                       batch_id, loss[0], acc1[0], acc5[0], \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        for data in test_reader():
            t1 = time.time()
            loss, acc1, acc5 = exe.run(
                inference_program,
                feed=feeder.feed(data),
                fetch_list=[avg_cost, acc_top1, acc_top5])
            t2 = time.time()
            period = t2 - t1
            test_info[0].append(loss[0])
            test_info[1].append(acc1[0])
            test_info[2].append(acc5[0])
            if batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, \
                       acc1 {3},acc5 {4},time {5}"
                                                  .format(pass_id, \
                       batch_id, loss[0], acc1[0], acc5[0], \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
               test_loss {4}, test_acc1 {5}, test_acc5 {6}"
                                                           .format(pass_id, \
              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
              test_acc5))
        sys.stdout.flush()

207 208
        model_path = os.path.join(model_save_dir + '/' + args.model,
                                  str(pass_id))
209 210 211 212
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)

D
Dang Qingqing 已提交
213

214 215 216 217 218
def train_parallel_exe(args,
                       learning_rate,
                       batch_size,
                       num_passes,
                       init_model=None,
219
                       pretrained_model=None,
220 221 222 223 224
                       model_save_dir='model',
                       parallel=True,
                       use_nccl=True,
                       lr_strategy=None,
                       layers=50):
Y
Yibing Liu 已提交
225
    class_dim = 1000
226 227 228 229
    image_shape = [3, 224, 224]

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
230 231
    if args.model is 'se_resnext':
        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
Y
Yibing Liu 已提交
232
    elif args.model is 'mobile_net':
233
        out = mobile_net(img=image, class_dim=class_dim)
Y
Yibing Liu 已提交
234 235
    else:
        out = inception_v4(img=image, class_dim=class_dim)
236

237 238 239 240 241 242 243
    cost = fluid.layers.cross_entropy(input=out, label=label)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    avg_cost = fluid.layers.mean(x=cost)

    test_program = fluid.default_main_program().clone(for_test=True)

244 245 246
    if "piecewise_decay" in lr_strategy:
        bd = lr_strategy["piecewise_decay"]["bd"]
        lr = lr_strategy["piecewise_decay"]["lr"]
247
        optimizer = fluid.optimizer.Momentum(
248 249 250 251 252 253 254 255 256 257 258 259
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif "cosine_decay" in lr_strategy:
        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
        epochs = lr_strategy["cosine_decay"]["epochs"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(
                learning_rate=learning_rate,
                step_each_epoch=step_each_epoch,
                epochs=epochs),
260 261 262 263
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        optimizer = fluid.optimizer.Momentum(
264
            learning_rate=learning_rate,
265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))

    opts = optimizer.minimize(avg_cost)

    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if init_model is not None:
        fluid.io.load_persistables(exe, init_model)

280 281 282 283 284 285 286
    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

287 288
    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
Y
Yibing Liu 已提交
289

290 291 292 293
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    test_exe = fluid.ParallelExecutor(
D
Dang Qingqing 已提交
294
        use_cuda=True, main_program=test_program, share_vars_from=train_exe)
295 296 297 298 299 300 301 302

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    for pass_id in range(num_passes):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
Y
Yibing Liu 已提交
303
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, \
                       acc1 {3}, acc5 {4} time {5}"
                                                   .format(pass_id, \
                       batch_id, loss, acc1, acc5, \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        for data in test_reader():
            t1 = time.time()
Y
Yibing Liu 已提交
325
            loss, acc1, acc5 = test_exe.run(fetch_list, feed=feeder.feed(data))
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            test_info[0].append(loss)
            test_info[1].append(acc1)
            test_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, \
                       acc1 {3},acc5 {4},time {5}"
                                                  .format(pass_id, \
                       batch_id, loss, acc1, acc5, \
                       "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
               test_loss {4}, test_acc1 {5}, test_acc5 {6}"
                                                           .format(pass_id, \
              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
              test_acc5))
        sys.stdout.flush()

353 354
        model_path = os.path.join(model_save_dir + '/' + args.model,
                                  str(pass_id))
355 356 357 358 359 360 361 362 363 364 365 366
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)


if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)

    total_images = 1281167
    batch_size = args.batch_size
    step = int(total_images / batch_size + 1)
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
    num_epochs = 120

    learning_rate_mode = args.lr_strategy
    lr_strategy = {}
    if learning_rate_mode == "piecewise_decay":
        epoch_points = [30, 60, 90]
        bd = [e * step for e in epoch_points]
        lr = [0.1, 0.01, 0.001, 0.0001]
        lr_strategy[learning_rate_mode] = {"bd": bd, "lr": lr}
    elif learning_rate_mode == "cosine_decay":
        lr_strategy[learning_rate_mode] = {
            "step_each_epoch": step,
            "epochs": num_epochs
        }
    else:
        lr_strategy = None
383 384 385 386 387

    use_nccl = True
    # layers: 50, 152
    layers = args.num_layers
    method = train_parallel_exe if args.parallel_exe else train_parallel_do
388 389
    init_model = args.init_model if args.init_model else None
    pretrained_model = args.pretrained_model if args.pretrained_model else None
D
Dang Qingqing 已提交
390 391 392 393
    method(
        args,
        learning_rate=0.1,
        batch_size=batch_size,
394 395 396
        num_passes=num_epochs,
        init_model=init_model,
        pretrained_model=pretrained_model,
D
Dang Qingqing 已提交
397 398 399 400
        parallel=True,
        use_nccl=True,
        lr_strategy=lr_strategy,
        layers=layers)