# train.py: training script for the MobileNet-SSD detector on PaddlePaddle Fluid.
import argparse
import functools
import os
import shutil
import time

import numpy as np
import paddle
import paddle.fluid as fluid

import reader
from mobilenet_ssd import mobile_net
from utility import add_arguments, print_arguments

# Command-line interface. `add_arg` binds the shared parser so every option
# below is declared as (name, type, default, help).
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# Optimisation / execution options.
add_arg('learning_rate',    float, 0.001,     "Learning rate.")
add_arg('batch_size',       int,   32,        "Minibatch size.")
add_arg('num_passes',       int,   120,       "Epoch number.")
add_arg('parallel',         bool,  True,      "Whether use parallel training.")
add_arg('use_gpu',          bool,  True,      "Whether to use GPU or not.")
add_arg('use_nccl',         bool,  False,     "Whether to use NCCL or not.")
# Dataset and checkpoint locations.
add_arg('dataset',          str, 'pascalvoc', "coco or pascalvoc.")
add_arg('model_save_dir',   str, 'model',     "The path to save model.")
add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
# Data augmentation and evaluation options.
add_arg('apply_distort',    bool, True,       "Whether apply distort")
add_arg('apply_expand',     bool, True,       "Whether apply expand")
add_arg('ap_version',       str,  '11point',  "11point or integral")
# Input geometry and per-channel mean subtraction. The trailing numbers are
# alternative mean values left by the original author.
add_arg('resize_h',         int,  300,        "The resized image height.")
add_arg('resize_w',         int,  300,        "The resized image width.")
add_arg('mean_value_B',     float, 127.5,     "mean value for B channel which will be subtracted")  #123.68
add_arg('mean_value_G',     float, 127.5,     "mean value for G channel which will be subtracted")  #116.78
add_arg('mean_value_R',     float, 127.5,     "mean value for R channel which will be subtracted")  #103.94
add_arg('is_toy',           int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample")
# yapf: enable


def parallel_do(args,
                train_file_list,
                val_file_list,
                data_args,
                learning_rate,
                batch_size,
                num_passes,
                model_save_dir,
                pretrained_model=None):
    """Train MobileNet-SSD with a plain Executor, optionally using ParallelDo.

    Builds the detection network, the SSD loss and a DetectionMAP evaluator,
    then runs ``num_passes`` training passes. The validation reader is
    evaluated after every pass, and persistable variables are saved whenever
    ``pass_id`` is a multiple of 10 and after the last pass.

    Args:
        args: Parsed command-line namespace; this function reads
            ``parallel``, ``use_nccl``, ``use_gpu`` and ``ap_version``.
        train_file_list: Training list/annotation file passed to
            ``reader.train``. For COCO it must contain '2014' or '2017',
            which selects the learning-rate schedule.
        val_file_list: Validation list/annotation file passed to
            ``reader.test``.
        data_args: Reader settings; this function reads ``resize_h``,
            ``resize_w`` and ``dataset``.
        learning_rate: Base learning rate of the piecewise-constant schedule.
        batch_size: Number of samples per minibatch.
        num_passes: Number of passes over the training reader.
        model_save_dir: Directory under which per-pass checkpoints are saved.
        pretrained_model: Optional directory with initial weights. Only
            variables that have a matching file in it are loaded.

    Raises:
        ValueError: If ``data_args.dataset`` is neither 'coco' nor
            'pascalvoc', or if the COCO year cannot be found in
            ``train_file_list``.
    """
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    # Class count handed to mobile_net / DetectionMAP.
    # NOTE(review): presumably includes a background class -- confirm.
    if data_args.dataset == 'coco':
        num_classes = 81
    elif data_args.dataset == 'pascalvoc':
        num_classes = 21
    else:
        raise ValueError(
            "Unsupported dataset '%s'; expected 'coco' or 'pascalvoc'." %
            data_args.dataset)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    if args.parallel:
        # Build the network inside a ParallelDo block so that each place
        # gets its own slice of the inputs.
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
        with pd.do():
            image_ = pd.read_input(image)
            gt_box_ = pd.read_input(gt_box)
            gt_label_ = pd.read_input(gt_label)
            difficult_ = pd.read_input(difficult)
            locs, confs, box, box_var = mobile_net(num_classes, image_,
                                                   image_shape)
            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box,
                                         box_var)
            nmsed_out = fluid.layers.detection_output(
                locs, confs, box, box_var, nms_threshold=0.45)
            loss = fluid.layers.reduce_sum(loss)
            pd.write_output(loss)
            pd.write_output(nmsed_out)

        loss, nmsed_out = pd()
        # Average the per-place summed losses into a single scalar.
        loss = fluid.layers.mean(loss)
    else:
        locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
        nmsed_out = fluid.layers.detection_output(
            locs, confs, box, box_var, nms_threshold=0.45)
        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
                                     box_var)
        loss = fluid.layers.reduce_sum(loss)

    # Clone before the optimizer is attached so the test program carries no
    # backward/update ops.
    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    if data_args.dataset == 'coco':
        # learning rate decay in 12, 19 pass, respectively
        # NOTE(review): 82783 / 118287 are presumably the COCO 2014 / 2017
        # training-set sizes -- confirm.
        if '2014' in train_file_list:
            iters_per_pass = 82783 // batch_size
        elif '2017' in train_file_list:
            iters_per_pass = 118287 // batch_size
        else:
            raise ValueError(
                "Cannot find COCO year ('2014' or '2017') in "
                "train_file_list '%s'." % train_file_list)
        boundaries = [iters_per_pass * 12, iters_per_pass * 19]
    else:
        # pascalvoc: boundaries are absolute iteration counts.
        boundaries = [40000, 60000]
    values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))

    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if pretrained_model:

        def if_exist(var):
            # Load only variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def test(pass_id):
        """Run the validation reader and print the accumulated mAP."""
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        test_map = None
        for data in test_reader():
            test_map = exe.run(test_program,
                               feed=feeder.feed(data),
                               fetch_list=[accum_map])
        if test_map is None:
            # Empty validation reader: nothing to report.
            print("Test {0}, no validation data".format(pass_id))
            return
        print("Test {0}, map {1}".format(pass_id, test_map[0]))

    for pass_id in range(num_passes):
        start_time = time.time()
        prev_start_time = start_time
        for batch_id, data in enumerate(train_reader()):
            # The reported time is the gap between the starts of two
            # consecutive batches.
            prev_start_time = start_time
            start_time = time.time()
            loss_v = exe.run(fluid.default_main_program(),
                             feed=feeder.feed(data),
                             fetch_list=[loss])
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v[0], start_time - prev_start_time))
        test(pass_id)

        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            model_path = os.path.join(model_save_dir, str(pass_id))
            print('save models to %s' % (model_path))
            fluid.io.save_persistables(exe, model_path)


def parallel_exe(args,
                 train_file_list,
                 val_file_list,
                 data_args,
                 learning_rate,
                 batch_size,
                 num_passes,
                 model_save_dir='model',
                 pretrained_model=None):
    """Train MobileNet-SSD with ``fluid.ParallelExecutor``.

    Builds the detection network, the SSD loss and a DetectionMAP evaluator,
    then runs ``num_passes`` training passes. After every pass the
    validation reader is evaluated; whenever the mAP improves, the model is
    saved to ``<model_save_dir>/best_model``. A numbered checkpoint is also
    saved whenever ``pass_id`` is a multiple of 10 and after the last pass.

    Args:
        args: Parsed command-line namespace; this function reads
            ``use_gpu`` and ``ap_version``.
        train_file_list: Training list/annotation file passed to
            ``reader.train``. For COCO it must contain '2014' or '2017',
            which selects the learning-rate schedule.
        val_file_list: Validation list/annotation file passed to
            ``reader.test``.
        data_args: Reader settings; this function reads ``resize_h``,
            ``resize_w`` and ``dataset``.
        learning_rate: Base learning rate of the piecewise-constant schedule.
        batch_size: Number of samples per minibatch.
        num_passes: Number of passes over the training reader.
        model_save_dir: Directory under which checkpoints are saved.
        pretrained_model: Optional directory with initial weights. Only
            variables that have a matching file in it are loaded.

    Raises:
        ValueError: If ``data_args.dataset`` is neither 'coco' nor
            'pascalvoc', or if the COCO year cannot be found in
            ``train_file_list``.
    """
    image_shape = [3, data_args.resize_h, data_args.resize_w]
    # Class count handed to mobile_net / DetectionMAP.
    # NOTE(review): presumably includes a background class -- confirm.
    if data_args.dataset == 'coco':
        num_classes = 81
    elif data_args.dataset == 'pascalvoc':
        num_classes = 21
    else:
        raise ValueError(
            "Unsupported dataset '%s'; expected 'coco' or 'pascalvoc'." %
            data_args.dataset)

    # Number of entries in CUDA_VISIBLE_DEVICES; an unset or empty variable
    # counts as one device because "".split(",") yields [""].
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
        name='gt_box', shape=[4], dtype='float32', lod_level=1)
    gt_label = fluid.layers.data(
        name='gt_label', shape=[1], dtype='int32', lod_level=1)
    difficult = fluid.layers.data(
        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
    nmsed_out = fluid.layers.detection_output(
        locs, confs, box, box_var, nms_threshold=0.45)
    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, box_var)
    loss = fluid.layers.reduce_sum(loss)

    # Clone before the optimizer is attached so the test program carries no
    # backward/update ops.
    test_program = fluid.default_main_program().clone(for_test=True)
    with fluid.program_guard(test_program):
        map_eval = fluid.evaluator.DetectionMAP(
            nmsed_out,
            gt_label,
            gt_box,
            difficult,
            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
            ap_version=args.ap_version)

    # piecewise_decay needs exactly one more value than boundaries, so the
    # value list is chosen together with the boundaries for each dataset.
    if data_args.dataset == 'coco':
        # learning rate decay in 12, 19 pass, respectively
        # NOTE(review): 82783 / 118287 are presumably the COCO 2014 / 2017
        # training-set sizes -- confirm.
        if '2014' in train_file_list:
            epocs = 82783 // batch_size
        elif '2017' in train_file_list:
            epocs = 118287 // batch_size
        else:
            raise ValueError(
                "Cannot find COCO year ('2014' or '2017') in "
                "train_file_list '%s'." % train_file_list)
        boundaries = [epocs * 12, epocs * 19]
        values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
    else:
        # pascalvoc
        # NOTE(review): 19200 is presumably an approximate training-set
        # size -- confirm.
        epocs = 19200 // batch_size
        boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
        values = [
            learning_rate, learning_rate * 0.5, learning_rate * 0.25,
            learning_rate * 0.1, learning_rate * 0.01
        ]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))

    optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if pretrained_model:

        def if_exist(var):
            # Load only variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_exe = fluid.ParallelExecutor(
        use_cuda=args.use_gpu, loss_name=loss.name)

    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
        reader.test(data_args, val_file_list), batch_size=batch_size)
    feeder = fluid.DataFeeder(
        place=place, feed_list=[image, gt_box, gt_label, difficult])

    def save_model(postfix):
        """Save persistables to model_save_dir/postfix, replacing old files."""
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path)

    best_map = 0.

    def test(pass_id, best_map):
        """Evaluate the validation reader and return the updated best mAP.

        The value is returned rather than assigned, because rebinding
        ``best_map`` here would not change the caller's variable.
        """
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        test_map = None
        for data in test_reader():
            test_map = exe.run(test_program,
                               feed=feeder.feed(data),
                               fetch_list=[accum_map])
        if test_map is None:
            # Empty validation reader: nothing to compare or report.
            print("Test {0}, no validation data".format(pass_id))
            return best_map
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model')
        print("Test {0}, map {1}".format(pass_id, test_map[0]))
        return best_map

    for pass_id in range(num_passes):
        start_time = time.time()
        prev_start_time = start_time
        for batch_id, data in enumerate(train_reader()):
            # The reported time is the gap between the starts of two
            # consecutive batches.
            prev_start_time = start_time
            start_time = time.time()
            # Skip batches with fewer samples than devices.
            if len(data) < devices_num:
                continue
            loss_v, = train_exe.run(fetch_list=[loss.name],
                                    feed_dict=feeder.feed(data))
            loss_v = np.mean(np.array(loss_v))
            if batch_id % 20 == 0:
                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                    pass_id, batch_id, loss_v, start_time - prev_start_time))
        best_map = test(pass_id, best_map)
        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            save_model(str(pass_id))
    print("Best test map {0}".format(best_map))


if __name__ == '__main__':
    # Script entry point: parse options, pick dataset paths, start training.
    args = parser.parse_args()
    print_arguments(args)

    # Defaults describe the PASCAL VOC layout; COCO overrides the data
    # directory and the annotation files below. label_file is the same for
    # both datasets.
    data_dir = 'data/pascalvoc'
    train_file_list = 'trainval.txt'
    val_file_list = 'test.txt'
    label_file = 'label_list'
    model_save_dir = args.model_save_dir
    if args.dataset == 'coco':
        data_dir = './data/COCO17'
        train_file_list = 'annotations/instances_train2017.json'
        val_file_list = 'annotations/instances_val2017.json'

    data_args = reader.Settings(
        dataset=args.dataset,
        data_dir=data_dir,
        label_file=label_file,
        apply_distort=args.apply_distort,
        apply_expand=args.apply_expand,
        resize_h=args.resize_h,
        resize_w=args.resize_w,
        mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R],
        toy=args.is_toy)
    # parallel_exe (ParallelExecutor) is the active training path;
    # parallel_do is the alternative ParallelDo-based implementation.
    method = parallel_exe
    method(
        args,
        train_file_list=train_file_list,
        val_file_list=val_file_list,
        data_args=data_args,
        learning_rate=args.learning_rate,
        batch_size=args.batch_size,
        num_passes=args.num_passes,
        model_save_dir=model_save_dir,
        pretrained_model=args.pretrained_model)