# train.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
4 5 6 7
import os
import numpy as np
import time
import sys
R
root 已提交
8 9
import functools
import math
10
import paddle
11
import paddle.fluid as fluid
12
import paddle.dataset.flowers as flowers
13
import models
14 15
import reader
import argparse
16
from models.learning_rate import cosine_decay
17 18 19 20
from utility import add_arguments, print_arguments

# Command-line interface. Each option is registered through the project's
# `add_arguments` helper with `argparser` pre-bound to this module's parser.
# The positional arguments below appear to be (name, type, default, help) --
# confirm against `utility.add_arguments`.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size',       int,   256,                  "Minibatch size.")
add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
add_arg('total_images',     int,   1281167,              "Training image number.")
add_arg('num_epochs',       int,   120,                  "number of epochs.")
add_arg('class_dim',        int,   1000,                 "Class number.")
add_arg('image_shape',      str,   "3,224,224",          "input image size")
add_arg('model_save_dir',   str,   "output",             "model save directory")
add_arg('with_mem_opt',     bool,  True,                 "Whether to use memory optimization or not.")
add_arg('pretrained_model', str,   None,                 "Whether to use pretrained model.")
add_arg('checkpoint',       str,   None,                 "Whether to resume checkpoint.")
add_arg('lr',               float, 0.1,                  "set learning rate.")
add_arg('lr_strategy',      str,   "piecewise_decay",    "Set the learning rate decay strategy.")
add_arg('model',            str,   "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce',        bool,  False,                "If set True, enable continuous evaluation job.")
# yapf: enable

# Names exported by the `models` package, excluding anything containing a
# double underscore; `train` checks the requested --model against this list.
model_list = [m for m in dir(models) if "__" not in m]


def optimizer_setting(params):
    """Build the Momentum optimizer described by ``params``.

    Args:
        params: dict read as follows:
            * ``"learning_strategy"``: dict with ``"name"`` (the schedule
              name) and, for the two decaying schedules, ``"batch_size"``;
              ``"piecewise_decay"`` additionally reads ``"epochs"`` (the
              epochs at which the rate is multiplied by 0.1).
            * ``"lr"``: base learning rate.
            * ``"total_images"`` (optional): training set size, defaults to
              1281167.
            * ``"num_epochs"``: read only for ``"cosine_decay"``.

    Returns:
        A ``fluid.optimizer.Momentum`` with momentum 0.9 and
        ``L2Decay(1e-4)`` regularization. Its learning rate is a piecewise
        schedule, a cosine schedule, or -- for any other strategy name --
        the constant ``params["lr"]``.
    """
    ls = params["learning_strategy"]
    strategy = ls["name"]
    base_lr = params["lr"]

    if strategy in ("piecewise_decay", "cosine_decay"):
        total_images = params.get("total_images", 1281167)
        batch_size = ls["batch_size"]
        # Ceiling division: number of mini-batches needed to cover the data
        # set once. (The previous `int(total / batch + 1)` over-counted by
        # one whenever total_images was an exact multiple of batch_size.)
        step = int((total_images + batch_size - 1) // batch_size)

    if strategy == "piecewise_decay":
        # Boundaries are expressed in iterations, not epochs.
        bd = [step * e for e in ls["epochs"]]
        # One more value than boundaries: the rate before the first boundary
        # plus one 10x-smaller rate after each boundary.
        lr_values = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        learning_rate = fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_values)
    elif strategy == "cosine_decay":
        learning_rate = cosine_decay(
            learning_rate=base_lr,
            step_each_epoch=step,
            epochs=params["num_epochs"])
    else:
        # Unknown strategy names fall back to a constant learning rate.
        learning_rate = base_lr

    return fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
87 88


89 90 91 92 93 94 95 96 97
def train(args):
    """Build the network, then train and evaluate it for ``num_epochs`` passes.

    Reads from ``args``: ``class_dim``, ``model``, ``checkpoint``,
    ``pretrained_model``, ``with_mem_opt``, ``model_save_dir``,
    ``image_shape`` (comma-separated ints), ``enable_ce``, ``total_images``,
    ``lr``, ``num_epochs``, ``batch_size``, ``lr_strategy`` and ``use_gpu``.

    For every pass it runs all training batches through a
    ``fluid.ParallelExecutor``, evaluates the cloned test program with the
    plain executor, prints a summary line, and saves persistables to
    ``<model_save_dir>/<model>/<pass_id>``. When ``enable_ce`` is set, KPI
    lines are printed after the final pass.

    Raises:
        AssertionError: if ``args.model`` is not in ``model_list``, or if
            ``enable_ce`` is set with a model other than
            ``SE_ResNeXt50_32x4d``.
    """
    # parameters from arguments
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # model definition
    model = models.__dict__[model_name]()

    if args.enable_ce:
        # Continuous evaluation pins the seeds and switches to the
        # 102-class flowers data set (see the reader setup below).
        assert model_name == "SE_ResNeXt50_32x4d"
        fluid.default_startup_program().random_seed = 1000
        model.params["dropout_seed"] = 100
        class_dim = 102

    avg_cost, acc_top1, acc_top5 = _build_loss_and_metrics(
        model, model_name, image, label, class_dim)

    # Clone before the optimizer is attached so the test program contains
    # no optimization ops.
    test_program = fluid.default_main_program().clone(for_test=True)

    # parameters from model and arguments
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy

    # initialize optimizer
    optimizer = optimizer_setting(params)
    optimizer.minimize(avg_cost)

    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)

    if pretrained_model:

        def if_exist(var):
            # Only load variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = args.batch_size
    test_batch_size = 16

    if not args.enable_ce:
        train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder data
        # but it is time consuming. For faster speed, need another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(
        use_cuda=bool(args.use_gpu), loss_name=avg_cost.name)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    # Device count is derived from CUDA_VISIBLE_DEVICES; an unset or empty
    # variable yields 1.
    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))

    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []

        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            # Collapse each fetched array to a scalar.
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean wall-clock seconds per batch divided by the batch size.
        train_speed = np.array(train_time).mean() / train_batch_size

        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            # Weight each batch by its size so a smaller final batch does
            # not skew the per-sample averages computed below.
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, test_batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            # All values are averages over the final pass.
            _print_ce_kpis(gpu_nums, [
                ("train_cost", train_loss),
                ("train_acc_top1", train_acc1),
                ("train_acc_top5", train_acc5),
                ("test_cost", test_loss),
                ("test_acc_top1", test_acc1),
                ("test_acc_top5", test_acc5),
                ("train_speed", train_speed),
            ])


def _build_loss_and_metrics(model, model_name, image, label, class_dim):
    """Add the forward pass, loss and accuracy ops; return (cost, top1, top5)."""
    if model_name == "GoogleNet":
        # GoogleNet returns three outputs; the second and third contribute
        # to the loss with weight 0.3, while accuracy uses the first only.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)

        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        out = out0
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)
        avg_cost = fluid.layers.mean(x=cost)

    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    return avg_cost, acc_top1, acc_top5


def _print_ce_kpis(gpu_nums, kpis):
    """Print one tab-separated ``kpis`` line per (name, value) pair.

    With more than one device the KPI name gets a ``_card<N>`` suffix.
    """
    suffix = "" if gpu_nums == 1 else "_card%s" % gpu_nums
    for kpi_name, value in kpis:
        print("kpis\t%s%s\t%s" % (kpi_name, suffix, value))

288

289
def main():
    """Parse the command line, pass the result to `print_arguments`, then train."""
    args = parser.parse_args()
    print_arguments(args)
    train(args)
293

294 295 296

# Start training only when this file is executed as a script.
if __name__ == '__main__':
    main()