import os
import time
import numpy as np
import argparse
import functools
import shutil
import math
import multiprocessing

import paddle
import paddle.fluid as fluid
import reader
from mobilenet_ssd import mobile_net
from utility import add_arguments, print_arguments

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('learning_rate',    float, 0.001,     "Learning rate.")
add_arg('batch_size',       int,   64,        "Minibatch size of all devices.")
add_arg('epoc_num',         int,   120,       "Epoch number.")
add_arg('use_gpu',          bool,  True,      "Whether to use GPU.")
add_arg('parallel',         bool,  True,      "Whether to train in parallel on multiple devices.")
add_arg('dataset',          str,   'pascalvoc', "Dataset can be coco2014, coco2017, or pascalvoc.")
add_arg('model_save_dir',   str,   'model',     "The path to save the model.")
add_arg('pretrained_model', str,   'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
add_arg('ap_version',       str,   '11point',           "mAP version can be integral or 11point.")
add_arg('image_shape',      str,   '3,300,300',         "Input image shape.")
add_arg('mean_BGR',         str,   '127.5,127.5,127.5', "Mean values for the B, G, R channels, which will be subtracted.")
add_arg('data_dir',         str,   'data/pascalvoc', "Data directory.")
add_arg('enable_ce',        bool,  False, "Whether to use CE to evaluate the model.")
# yapf: enable

train_parameters = {
    "pascalvoc": {
        "train_images": 16551,
        "image_shape": [3, 300, 300],
        "class_num": 21,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [40, 60, 80, 100],
        "lr_decay": [1, 0.5, 0.25, 0.1, 0.01],
        "ap_version": '11point',
    },
    "coco2014": {
        "train_images": 82783,
        "image_shape": [3, 300, 300],
        "class_num": 91,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [12, 19],
        "lr_decay": [1, 0.5, 0.25],
        "ap_version": 'integral', # should use eval_coco_map.py to test model
    },
    "coco2017": {
        "train_images": 118287,
        "image_shape": [3, 300, 300],
        "class_num": 91,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [12, 19],
        "lr_decay": [1, 0.5, 0.25],
        "ap_version": 'integral', # should use eval_coco_map.py to test model
    }
}

def optimizer_setting(train_params):
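    # The learning-rate schedule is given in epochs; convert the epoch
    # boundaries to iteration counts so piecewise_decay switches at the right step.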
    batch_size = train_params["batch_size"]
    iters = train_params["train_images"] // batch_size
    lr = train_params["lr"]
    boundaries = [i * iters for i in train_params["lr_epochs"]]
    values = [i * lr for i in train_params["lr_decay"]]

    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005))

    return optimizer


def build_program(main_prog, startup_prog, train_params, is_train):
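    # Shared program builder: batches are fed through a py_reader into the
    # MobileNet-SSD network; the train branch attaches the SSD loss and the
    # optimizer, the eval branch attaches NMS detection output and a mAP metric.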
    image_shape = train_params['image_shape']
    class_num = train_params['class_num']
    ap_version = train_params['ap_version']
    outs = []
    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 1], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "int32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
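            # Reset the name scope so the train and eval programs generate
            # identical parameter names and can share the same weights.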
            image, gt_box, gt_label, difficult = fluid.layers.read_file(py_reader)
            locs, confs, box, box_var = mobile_net(class_num, image, image_shape)
            if is_train:
                with fluid.unique_name.guard("train"):
                    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
                        box_var)
                    loss = fluid.layers.reduce_sum(loss)
                    optimizer = optimizer_setting(train_params)
                    optimizer.minimize(loss)
                outs = [py_reader, loss]
            else:
                with fluid.unique_name.guard("inference"):
                    nmsed_out = fluid.layers.detection_output(
                        locs, confs, box, box_var, nms_threshold=0.45)
                    map_eval = fluid.metrics.DetectionMAP(
                        nmsed_out,
                        gt_label,
                        gt_box,
                        difficult,
                        class_num,
                        overlap_threshold=0.5,
                        evaluate_difficult=False,
                        ap_version=ap_version)
                # nmsed_out and image are used to save the model for inference
                outs = [py_reader, map_eval, nmsed_out, image]
    return outs


def train(args,
          data_args,
          train_params,
          train_file_list,
          val_file_list):

    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = args.parallel
    enable_ce = args.enable_ce
    is_shuffle = True

    if not use_gpu:
        devices_num = int(os.environ.get('CPU_NUM',
                          multiprocessing.cpu_count()))
    else:
        devices_num = fluid.core.get_cuda_device_count()

    batch_size = train_params['batch_size']
    epoc_num = train_params['epoc_num']
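    # Split the global batch evenly across devices; each device consumes
    # batch_size_per_device samples per iteration.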
    batch_size_per_device = batch_size // devices_num
    num_workers = 8

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    if enable_ce:
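        # CE (Continuous Evaluation) runs must be deterministic: fix all
        # random seeds and disable data shuffling.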
        import random
        random.seed(0)
        np.random.seed(0)
        is_shuffle = False
        startup_prog.random_seed = 111
        train_prog.random_seed = 111
        test_prog.random_seed = 111

    train_py_reader, loss = build_program(
        main_prog=train_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=True)
    test_py_reader, map_eval, _, _ = build_program(
        main_prog=test_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=False)

    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:
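        # Load only the pretrained variables that have a matching file on
        # disk; everything else keeps its fresh initialization.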
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(exe, pretrained_model, main_program=train_prog,
                           predicate=if_exist)

    if parallel:
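        # Run training with a ParallelExecutor across all available devices;
        # in-place execution and memory optimization reduce memory usage.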
        loss.persistable = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.enable_inplace = True
        build_strategy.memory_optimize = True
        train_exe = fluid.ParallelExecutor(main_program=train_prog,
            use_cuda=use_gpu, loss_name=loss.name, build_strategy=build_strategy)
    train_reader = reader.train(data_args,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                num_workers=num_workers,
                                enable_ce=enable_ce)
    test_reader = reader.test(data_args, val_file_list, batch_size)
    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    def save_model(postfix, main_prog):
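        # Overwrite any existing checkpoint with the same postfix, then save
        # all persistable variables of the given program.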
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=main_prog)

    best_map = 0.
    def test(epoc_id, best_map):
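        # Run one full pass over the validation set; accum_map accumulates the
        # mAP across batches and is reset before the pass, so the last fetched
        # value is the epoch-level mAP.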
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        every_epoc_map = []  # for CE
        test_py_reader.start()
        try:
            batch_id = 0
            while True:
                test_map, = exe.run(test_prog, fetch_list=[accum_map])
                if batch_id % 10 == 0:
                    every_epoc_map.append(test_map)
                    print("Batch {0}, map {1}".format(batch_id, test_map))
                batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()
        mean_map = np.mean(every_epoc_map)
        print("Epoc {0}, test map {1}".format(epoc_id, test_map[0]))
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model', test_prog)
        return best_map, mean_map


    total_time = 0.0
    for epoc_id in range(epoc_num):
        epoch_idx = epoc_id + 1
        start_time = time.time()
        prev_start_time = start_time
        every_epoc_loss = []
        batch_id = 0
        train_py_reader.start()
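        # Loop until the py_reader exhausts this epoch's data and raises
        # EOFException.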
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if parallel:
                    loss_v, = train_exe.run(fetch_list=[loss.name])
                else:
                    loss_v, = exe.run(train_prog, fetch_list=[loss])
                loss_v = np.mean(np.array(loss_v))
                every_epoc_loss.append(loss_v)
                if batch_id % 10 == 0:
                    print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                        epoc_id, batch_id, loss_v, start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                train_reader().close()
                train_py_reader.reset()
                break

        end_time = time.time()
        total_time += end_time - start_time
        best_map, mean_map = test(epoc_id, best_map)
        print("Best test map {0}".format(best_map))
        if epoc_id % 10 == 0 or epoc_id == epoc_num - 1:
            save_model(str(epoc_id), train_prog)

    if enable_ce:
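        # Emit tab-separated KPI lines (train cost, test accuracy, train speed)
        # for CE; multi-card runs append the card count to each KPI name.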
        train_avg_loss = np.mean(every_epoc_loss)
        if devices_num == 1:
            print("kpis	train_cost	%s" % train_avg_loss)
            print("kpis	test_acc	%s" % mean_map)
            print("kpis	train_speed	%s" % (total_time / epoch_idx))
        else:
            print("kpis	train_cost_card%s	%s" %
                   (devices_num, train_avg_loss))
            print("kpis	test_acc_card%s	%s" %
                   (devices_num, mean_map))
            print("kpis	train_speed_card%s	%f" %
                   (devices_num, total_time / epoch_idx))


if __name__ == '__main__':
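    # Example usage (all flags are optional; defaults are defined above):
    #   python train.py --dataset pascalvoc --batch_size 64 --epoc_num 120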
    args = parser.parse_args()
    print_arguments(args)

    data_dir = args.data_dir
    dataset = args.dataset
    assert dataset in ['pascalvoc', 'coco2014', 'coco2017']

    # for pascalvoc
    label_file = 'label_list'
    train_file_list = 'trainval.txt'
    val_file_list = 'test.txt'

    if dataset == 'coco2014':
        train_file_list = 'annotations/instances_train2014.json'
        val_file_list = 'annotations/instances_val2014.json'
    elif dataset == 'coco2017':
        train_file_list = 'annotations/instances_train2017.json'
        val_file_list = 'annotations/instances_val2017.json'

    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
    image_shape = [int(m) for m in args.image_shape.split(",")]
    train_parameters[dataset]['image_shape'] = image_shape
    train_parameters[dataset]['batch_size'] = args.batch_size
    train_parameters[dataset]['lr'] = args.learning_rate
    train_parameters[dataset]['epoc_num'] = args.epoc_num
    train_parameters[dataset]['ap_version'] = args.ap_version
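    # Reader settings: images are resized to the configured shape, the BGR
    # mean values are subtracted, and distortion/expansion augmentations are
    # turned on.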

    data_args = reader.Settings(
        dataset=args.dataset,
        data_dir=data_dir,
        label_file=label_file,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        mean_value=mean_BGR,
        apply_distort=True,
        apply_expand=True,
        ap_version=args.ap_version)
    train(args,
          data_args,
          train_parameters[dataset],
          train_file_list=train_file_list,
          val_file_list=val_file_list)