train.py 11.5 KB
Newer Older
R
root 已提交
1 2 3
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
4 5 6 7
import os
import numpy as np
import time
import sys
R
root 已提交
8 9
import functools
import math
10
import paddle
11
import paddle.fluid as fluid
12
import paddle.dataset.flowers as flowers
13
import models
14 15
import reader
import argparse
16
from models.learning_rate import cosine_decay
17 18 19 20
from utility import add_arguments, print_arguments

# Command-line interface. Every option is registered through
# utility.add_arguments, pre-bound to this parser via functools.partial.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# Columns below are: option name, type, default value, help text.
# yapf: disable
add_arg('batch_size',       int,   256,                  "Minibatch size.")
add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
add_arg('total_images',     int,   1281167,              "Training image number.")
add_arg('num_epochs',       int,   120,                  "number of epochs.")
add_arg('class_dim',        int,   1000,                 "Class number.")
add_arg('image_shape',      str,   "3,224,224",          "input image size")
add_arg('model_save_dir',   str,   "output",             "model save directory")
add_arg('with_mem_opt',     bool,  True,                 "Whether to use memory optimization or not.")
# NOTE: despite the help text, train() uses `pretrained_model` and `checkpoint`
# as directory paths to load variables from, not as on/off flags.
add_arg('pretrained_model', str,   None,                 "Whether to use pretrained model.")
add_arg('checkpoint',       str,   None,                 "Whether to resume checkpoint.")
add_arg('lr',               float, 0.1,                  "set learning rate.")
add_arg('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate decay strategy.")
add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job.")
add_arg('data_dir',         str,   "./data/ILSVRC2012",  "The ImageNet dataset root dir.")
# yapf: enable

# Names exposed by the `models` package that may be passed as --model;
# any attribute whose name contains a double underscore is skipped.
model_list = [m for m in dir(models) if "__" not in m]


def _steps_per_epoch(params, batch_size):
    """Return the number of optimizer steps that make up one epoch.

    Falls back to 1281167 images when ``params`` has no "total_images" key.
    """
    total_images = params.get("total_images", 1281167)
    # True division is in effect (file-level ``from __future__ import
    # division``), so this is floor(total_images / batch_size) + 1. The formula
    # is kept unchanged so existing learning-rate schedules do not shift.
    return int(total_images / batch_size + 1)


def optimizer_setting(params):
    """Create the Momentum optimizer for the configured learning-rate strategy.

    Args:
        params: dict of training settings. Keys read here:
            "learning_strategy": dict with "name", plus "batch_size" for the
                two decay strategies and "epochs" (epoch numbers at which the
                rate is cut) for "piecewise_decay".
            "lr": base learning rate.
            "num_epochs": total number of epochs ("cosine_decay" only).
            "total_images": optional training-set size, default 1281167.

    Returns:
        A ``fluid.optimizer.Momentum`` with momentum 0.9 and L2 weight decay
        1e-4. The learning rate is piecewise-constant for "piecewise_decay"
        (multiplied by 0.1 at each boundary), produced by ``cosine_decay`` for
        "cosine_decay", and the constant ``params["lr"]`` for any other
        strategy name.
    """
    ls = params["learning_strategy"]
    strategy = ls["name"]

    if strategy == "piecewise_decay":
        step = _steps_per_epoch(params, ls["batch_size"])
        # Convert epoch numbers into global step boundaries.
        bd = [step * e for e in ls["epochs"]]
        base_lr = params["lr"]
        # One value per interval: len(bd) boundaries delimit len(bd) + 1 stages.
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
    elif strategy == "cosine_decay":
        step = _steps_per_epoch(params, ls["batch_size"])
        learning_rate = cosine_decay(
            learning_rate=params["lr"],
            step_each_epoch=step,
            epochs=params["num_epochs"])
    else:
        # Unknown strategy names fall back to a constant learning rate.
        learning_rate = params["lr"]

    return fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
88 89


90 91 92 93 94 95 96 97 98
def train(args):
    """Build the network, then train and evaluate it once per pass.

    After every pass the persistable variables are saved under
    ``<model_save_dir>/<model>/<pass_id>``. When ``args.enable_ce`` is set the
    flowers dataset is used with fixed random seeds, and "kpis" lines are
    printed after the final pass.

    Args:
        args: parsed command-line namespace. Attributes read here:
            class_dim, model, checkpoint, pretrained_model, with_mem_opt,
            model_save_dir, image_shape, enable_ce, total_images, lr,
            num_epochs, batch_size, lr_strategy and use_gpu.

    Raises:
        AssertionError: if ``args.model`` is not in ``model_list``, or if
            ``enable_ce`` is set with a model other than "SE_ResNeXt50_32x4d".
    """
    # parameters from arguments
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # model definition
    model = models.__dict__[model_name]()

    if args.enable_ce:
        # Continuous evaluation: fix the seeds and use the 102-class setting
        # that goes with the flowers dataset selected further below.
        assert model_name == "SE_ResNeXt50_32x4d"
        fluid.default_startup_program().random_seed = 1000
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name == "GoogleNet":
        # GoogleNet returns three outputs; the two extra ones contribute to
        # the loss with weight 0.3, while accuracy uses the first output only.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)

        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    # Clone the evaluation program before the optimizer is added to the main
    # program by minimize() below.
    test_program = fluid.default_main_program().clone(for_test=True)

    # parameters from model and arguments
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy

    # initialize optimizer
    optimizer = optimizer_setting(params)
    optimizer.minimize(avg_cost)

    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)

    if pretrained_model:

        def if_exist(var):
            # Load only variables that have a file in the pretrained directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = args.batch_size
    test_batch_size = 16

    if not args.enable_ce:
        train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(
        use_cuda=bool(args.use_gpu), loss_name=avg_cost.name)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    # Number of comma-separated entries in CUDA_VISIBLE_DEVICES; an unset or
    # empty variable counts as 1.
    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))
    for pass_id in range(params["num_epochs"]):
        # Each *_info list holds [losses, top-1 accuracies, top-5 accuracies].
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            if batch_id % 10 == 0:
                # Adjacent string literals are used instead of a backslash
                # continuation inside the literal, which would embed the
                # source indentation in the printed message.
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean time per batch divided by the batch size, i.e. seconds per image.
        train_speed = np.array(train_time).mean() / train_batch_size

        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            # Weight each batch by its size so a smaller batch does not skew
            # the per-sample averages computed after the loop.
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, test_batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            # KPI names are unsuffixed for one card and get "_card<N>" for
            # several, so the two configurations are reported separately.
            card_suffix = "" if gpu_nums == 1 else "_card%s" % gpu_nums
            kpis = (
                # Use the mean cost/acc for training
                ("train_cost", train_loss),
                ("train_acc_top1", train_acc1),
                ("train_acc_top5", train_acc5),
                # Use the mean cost/acc for testing
                ("test_cost", test_loss),
                ("test_acc_top1", test_acc1),
                ("test_acc_top5", test_acc5),
                ("train_speed", train_speed),
            )
            for kpi_name, kpi_value in kpis:
                print("kpis\t%s%s\t%s" % (kpi_name, card_suffix, kpi_value))
281

282

283
def main():
    """Parse the command-line arguments, print them, and start training."""
    args = parser.parse_args()
    print_arguments(args)
    train(args)
287

288 289 290

# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()