train.py 11.0 KB
Newer Older
1
import os
X
Xingyuan Bu 已提交
2
import time
3 4 5
import numpy as np
import argparse
import functools
D
Dang Qingqing 已提交
6
import shutil
B
Bai Yifan 已提交
7
import math
8

D
Dang Qingqing 已提交
9 10 11 12 13 14
import paddle
import paddle.fluid as fluid
import reader
from mobilenet_ssd import mobile_net
from utility import add_arguments, print_arguments

15 16
# Command-line interface. `add_arg` is `utility.add_arguments` pre-bound to
# this parser; each call below passes what appear to be
# (name, type, default, help text) -- confirm against utility.add_arguments.
# NOTE(review): no module docstring is visible in this file, so `__doc__`
# (used as the parser description) may be None -- confirm.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('learning_rate',    float, 0.001,     "Learning rate.")
add_arg('batch_size',       int,   64,        "Minibatch size.")
add_arg('epoc_num',         int,   120,       "Epoch number.")
add_arg('use_gpu',          bool,  True,      "Whether use GPU.")
add_arg('parallel',         bool,  True,      "Parallel.")
add_arg('dataset',          str,   'pascalvoc', "coco2014, coco2017, and pascalvoc.")
add_arg('model_save_dir',   str,   'model',     "The path to save model.")
add_arg('pretrained_model', str,   'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
add_arg('ap_version',       str,   '11point',           "Integral, 11point.")
add_arg('image_shape',      str,   '3,300,300',         "Input image shape.")
add_arg('mean_BGR',   str,   '127.5,127.5,127.5', "Mean value for B,G,R channel which will be subtracted.")
add_arg('data_dir',         str,   'data/pascalvoc', "data directory")
add_arg('enable_ce',     bool,  False, "Whether use CE to evaluate the model")
#yapf: enable
32

B
Bai Yifan 已提交
33 34
# Per-dataset training defaults, keyed by the value of the `dataset` option.
#   train_images : number of training images; divided by the batch size to
#                  get the iterations per epoch.
#   image_shape  : input shape list handed to the reader and the network.
#   class_num    : class count handed to the network and the mAP evaluator.
#   batch_size,
#   lr           : defaults that the script entry point overwrites with the
#                  command-line values (as it does for image_shape).
#   lr_epochs    : epochs at which the learning rate switches to the next
#                  entry of lr_decay.
#   lr_decay     : multipliers applied to lr; always one entry longer than
#                  lr_epochs (one value per interval between boundaries).
train_parameters = {
    "pascalvoc": {
        "train_images": 16551,
        "image_shape": [3, 300, 300],
        "class_num": 21,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [40, 60, 80, 100],
        "lr_decay": [1, 0.5, 0.25, 0.1, 0.01]
    },
    "coco2014": {
        "train_images": 82783,
        "image_shape": [3, 300, 300],
        "class_num": 91,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [12, 19],
        "lr_decay": [1, 0.5, 0.25]
    },
    "coco2017": {
        "train_images": 118287,
        "image_shape": [3, 300, 300],
        "class_num": 91,
        "batch_size": 64,
        "lr": 0.001,
        "lr_epochs": [12, 19],
        "lr_decay": [1, 0.5, 0.25]
    }
}

def optimizer_setting(train_params):
    """Build the RMSProp optimizer with a piecewise learning-rate schedule.

    Args:
        train_params: dict providing "batch_size", "train_images", "lr",
            "lr_epochs" and "lr_decay" (one entry of ``train_parameters``).

    Returns:
        A ``fluid.optimizer.RMSProp`` instance whose learning rate is a
        ``piecewise_decay`` schedule and which applies L2 weight decay with
        coefficient 0.00005.
    """
    batch_size = train_params["batch_size"]
    # Iterations per epoch; converts the epoch-based boundaries into steps.
    iters = train_params["train_images"] // batch_size
    lr = train_params["lr"]
    boundaries = [epoch * iters for epoch in train_params["lr_epochs"]]
    values = [factor * lr for factor in train_params["lr_decay"]]

    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )

    return optimizer


def build_program(main_prog, startup_prog, train_params, is_train,
                  ap_version=None):
    """Populate ``main_prog`` with the MobileNet-SSD train or eval graph.

    Args:
        main_prog: program that receives the reader, network and loss or
            evaluator ops.
        startup_prog: program used as the startup program while building.
        train_params: dict providing "image_shape" and "class_num", plus the
            keys read by ``optimizer_setting`` when ``is_train`` is true.
        is_train: when true, append the SSD loss and the optimizer; otherwise
            append detection output and a DetectionMAP evaluator.
        ap_version: value forwarded to DetectionMAP as ``ap_version``. Only
            used when ``is_train`` is false. When None, falls back to the
            module-level ``args.ap_version`` (set only when this file is run
            as a script), which was the previous hard-wired behaviour.

    Returns:
        ``(py_reader, loss)``. In training mode ``loss`` is the summed SSD
        loss variable; in evaluation mode it is the
        ``fluid.evaluator.DetectionMAP`` object.
    """
    image_shape = train_params['image_shape']
    class_num = train_params['class_num']
    with fluid.program_guard(main_prog, startup_prog):
        # Four reader slots, unpacked below as image, gt_box, gt_label and
        # difficult.
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 1], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "int32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, gt_box, gt_label, difficult = fluid.layers.read_file(py_reader)
            locs, confs, box, box_var = mobile_net(class_num, image, image_shape)
            if is_train:
                loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
                    box_var)
                loss = fluid.layers.reduce_sum(loss)
                optimizer = optimizer_setting(train_params)
                optimizer.minimize(loss)
            else:
                if ap_version is None:
                    # Resolved lazily so that building the training graph
                    # never requires the global command-line namespace.
                    ap_version = args.ap_version
                nmsed_out = fluid.layers.detection_output(
                    locs, confs, box, box_var, nms_threshold=0.45)
                with fluid.program_guard(main_prog):
                    loss = fluid.evaluator.DetectionMAP(
                        nmsed_out,
                        gt_label,
                        gt_box,
                        difficult,
                        class_num,
                        overlap_threshold=0.5,
                        evaluate_difficult=False,
                        ap_version=ap_version)
    return py_reader, loss


X
Xingyuan Bu 已提交
113 114
def train(args,
          data_args,
          train_params,
          train_file_list,
          val_file_list):
    """Train the detector, evaluating and checkpointing after every epoch.

    Args:
        args: namespace providing ``model_save_dir``, ``pretrained_model``,
            ``use_gpu``, ``parallel`` and ``enable_ce``.
        data_args: settings object forwarded to ``reader.train`` and
            ``reader.test``.
        train_params: dict providing "batch_size", "epoc_num" and
            "train_images", plus the keys read by ``build_program``.
        train_file_list: training list/annotation file passed to
            ``reader.train``.
        val_file_list: validation list/annotation file passed to
            ``reader.test``.

    Side effects:
        Saves persistables under ``model_save_dir/<epoc_id>`` every 10th
        epoch and on the last epoch, and under ``model_save_dir/best_model``
        whenever the test mAP improves. Prints progress, and KPI lines on the
        last epoch when ``enable_ce`` is set.
    """
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = args.parallel
    enable_ce = args.enable_ce
    is_shuffle = True

    # An unset or empty CUDA_VISIBLE_DEVICES yields [""], i.e. one device.
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size = train_params['batch_size']
    epoc_num = train_params['epoc_num']
    batch_size_per_device = batch_size // devices_num
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    if enable_ce:
        # Fix every seed and disable shuffling so the run is repeatable.
        import random
        random.seed(0)
        np.random.seed(0)
        is_shuffle = False
        startup_prog.random_seed = 111
        train_prog.random_seed = 111
        test_prog.random_seed = 111

    train_py_reader, loss = build_program(
        main_prog=train_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=True)
    test_py_reader, map_eval = build_program(
        main_prog=test_prog,
        startup_prog=startup_prog,
        train_params=train_params,
        is_train=False)

    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:
        def if_exist(var):
            # Load only variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(exe, pretrained_model, main_program=train_prog,
                           predicate=if_exist)

    if parallel:
        train_exe = fluid.ParallelExecutor(main_program=train_prog,
            use_cuda=use_gpu, loss_name=loss.name)
    train_reader = reader.train(data_args,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                use_multiprocessing=True,
                                num_workers=num_workers,
                                max_queue=24,
                                enable_ce=enable_ce)
    test_reader = reader.test(data_args, val_file_list, batch_size)
    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    def save_model(postfix, main_prog):
        """Save the persistables of main_prog to model_save_dir/postfix."""
        model_path = os.path.join(model_save_dir, postfix)
        # Replace any checkpoint already stored under the same name.
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=main_prog)

    best_map = 0.

    def test(epoc_id, best_map):
        """Run one pass over the test reader.

        Returns ``(best_map, mean_map)``: the possibly updated best mAP and
        the mean of the accumulated mAP sampled every 10th batch.
        """
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        every_epoc_map = []
        test_map = None
        test_py_reader.start()
        try:
            batch_id = 0
            while True:
                test_map, = exe.run(test_prog, fetch_list=[accum_map])
                if batch_id % 10 == 0:
                    every_epoc_map.append(test_map)
                    print("Batch {0}, map {1}".format(batch_id, test_map))
                batch_id += 1
        except fluid.core.EOFException:
            # The reader signals the end of the test set with EOFException.
            test_py_reader.reset()
        if test_map is None:
            # No batch was evaluated; previously this fell through to a
            # NameError on test_map.
            print("Epoc {0}, no test batches were evaluated".format(epoc_id))
            return best_map, float('nan')
        mean_map = np.mean(every_epoc_map)
        print("Epoc {0}, test map {1}".format(epoc_id, test_map))
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model', test_prog)
        return best_map, mean_map

    train_py_reader.start()
    total_time = 0.0
    try:
        for epoc_id in range(epoc_num):
            epoch_idx = epoc_id + 1
            # Kept apart from the per-batch start_time so that total_time
            # accumulates whole-epoch durations rather than only the last
            # batch of each epoch.
            epoch_start_time = time.time()
            start_time = epoch_start_time
            every_epoc_loss = []
            for batch_id in range(iters_per_epoc):
                prev_start_time = start_time
                start_time = time.time()
                if parallel:
                    loss_v, = train_exe.run(fetch_list=[loss.name])
                else:
                    loss_v, = exe.run(train_prog, fetch_list=[loss])
                loss_v = np.mean(np.array(loss_v))
                every_epoc_loss.append(loss_v)
                if batch_id % 20 == 0:
                    # The reported time is the gap between the starts of
                    # the previous and the current batch.
                    print("Epoc {0}, batch {1}, loss {2}, time {3}".format(
                        epoc_id, batch_id, loss_v, start_time - prev_start_time))
            total_time += time.time() - epoch_start_time

            best_map, mean_map = test(epoc_id, best_map)
            print("Best test map {0}".format(best_map))
            if epoc_id % 10 == 0 or epoc_id == epoc_num - 1:
                save_model(str(epoc_id), train_prog)

            if enable_ce and epoc_id == epoc_num - 1:
                train_avg_loss = np.mean(every_epoc_loss)
                if devices_num == 1:
                    print("kpis\ttrain_cost\t%s" % train_avg_loss)
                    print("kpis\ttest_acc\t%s" % mean_map)
                    print("kpis\ttrain_speed\t%s" % (total_time / epoch_idx))
                else:
                    print("kpis\ttrain_cost_card%s\t%s" %
                           (devices_num, train_avg_loss))
                    print("kpis\ttest_acc_card%s\t%s" %
                           (devices_num, mean_map))
                    print("kpis\ttrain_speed_card%s\t%f" %
                           (devices_num, total_time / epoch_idx))

    except (fluid.core.EOFException, StopIteration):
        # The training reader ran dry before all epochs finished; stop
        # training and fall through to the single reset below.
        pass
    train_py_reader.reset()
B
baiyfbupt 已提交
262

D
dangqingqing 已提交
263 264

if __name__ == '__main__':
    # `args` is deliberately module-global: build_program reads
    # args.ap_version when it builds the evaluation graph.
    args = parser.parse_args()
    print_arguments(args)

    data_dir = args.data_dir
    dataset = args.dataset
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if dataset not in ['pascalvoc', 'coco2014', 'coco2017']:
        raise ValueError(
            "Unsupported dataset %r; expected one of 'pascalvoc', "
            "'coco2014', 'coco2017'." % dataset)

    # for pascalvoc
    label_file = 'label_list'
    train_file_list = 'trainval.txt'
    val_file_list = 'test.txt'

    if dataset == 'coco2014':
        train_file_list = 'annotations/instances_train2014.json'
        val_file_list = 'annotations/instances_val2014.json'
    elif dataset == 'coco2017':
        train_file_list = 'annotations/instances_train2017.json'
        val_file_list = 'annotations/instances_val2017.json'

    # Command-line values override the per-dataset defaults.
    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
    image_shape = [int(m) for m in args.image_shape.split(",")]
    train_parameters[dataset]['image_shape'] = image_shape
    train_parameters[dataset]['batch_size'] = args.batch_size
    train_parameters[dataset]['lr'] = args.learning_rate
    train_parameters[dataset]['epoc_num'] = args.epoc_num

    data_args = reader.Settings(
        dataset=args.dataset,
        data_dir=data_dir,
        label_file=label_file,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        mean_value=mean_BGR,
        apply_distort=True,
        apply_expand=True,
        ap_version=args.ap_version)
    train(args,
          data_args,
          train_parameters[dataset],
          train_file_list=train_file_list,
          val_file_list=val_file_list)