from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import shutil
import numpy as np
import time
import argparse
import functools


def set_paddle_flags(**kwargs):
    """Export keyword arguments as environment variables.

    Each keyword name becomes an environment variable whose value is the
    ``str()`` of the given value. A variable that already exists in the
    environment is left untouched, so externally supplied settings win.
    """
    for name, setting in kwargs.items():
        if name in os.environ:
            # Respect a value that was already exported.
            continue
        os.environ[name] = str(setting)


# NOTE(paddle-dev): All of these flags must be set before `import paddle`;
# otherwise they will not take effect. Flags that are already present in
# the environment are not overridden.
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)

import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
import reader
from utility import add_arguments, print_arguments, check_cuda

# Command-line interface. `add_arg(name, type, default, help)` forwards to
# `utility.add_arguments` with this parser bound as `argparser`.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
add_arg('parallel',         bool,  True,            "Whether use multi-GPU/threads or not.")
add_arg('learning_rate',    float, 0.001,           "The start learning rate.")
add_arg('batch_size',       int,   16,              "Minibatch size.")
add_arg('epoc_num',         int,   160,             "Epoch number.")
add_arg('use_gpu',          bool,  True,            "Whether use GPU.")
add_arg('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model.")
add_arg('model_save_dir',   str,   'output',        "The path to save model.")
add_arg('resize_h',         int,   640,             "The resized image height.")
add_arg('resize_w',         int,   640,             "The resized image width.")
add_arg('mean_BGR',         str,   '104., 117., 123.', "Mean value for B,G,R channel which will be subtracted.")
add_arg('with_mem_opt',     bool,  True,            "Whether to use memory optimization or not.")
add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
add_arg('data_dir',         str,   'data',          "The base dir of dataset")
add_arg('use_multiprocess', bool,  True,            "Whether use multi-process for data preprocessing.")
# Options used only by the continuous-evaluation (CE) runs.
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--batch_num', type=int, help="batch num for ce")
parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
# yapf: enable

# Default training configuration. The `__main__` section overwrites
# "image_shape", "use_pyramidbox", "batch_size", "lr" and "epoc_num" with the
# values parsed from the command line.
train_parameters = {
    # Data.
    "train_images": 12880,  # divided by batch_size to get iterations per epoch
    "image_shape": [3, 640, 640],
    "class_num": 2,
    "batch_size": 16,

    # Learning-rate schedule: the rate is lr * lr_decay[k], switching at the
    # epochs listed in lr_epochs.
    "lr": 0.001,
    "lr_epochs": [99, 124, 149],
    "lr_decay": [1, 0.1, 0.01, 0.001],
    "epoc_num": 160,

    # "momentum" selects Momentum; any other value selects RMSProp.
    "optimizer_method": "momentum",
    "use_pyramidbox": True,
}

def optimizer_setting(train_params):
    """Create the optimizer with a piecewise-constant learning-rate schedule.

    Args:
        train_params: dict providing "batch_size", "train_images", "lr",
            "optimizer_method", "lr_epochs" and "lr_decay".

    Returns:
        A ``fluid.optimizer.Momentum`` (momentum 0.9) when
        ``optimizer_method`` is "momentum", otherwise a
        ``fluid.optimizer.RMSProp``. Both use ``L2Decay(0.0005)``
        regularization.
    """
    batch_size = train_params["batch_size"]
    # Number of whole batches per epoch.
    iters = train_params["train_images"] // batch_size
    lr = train_params["lr"]
    optimizer_method = train_params["optimizer_method"]
    # Epoch indices are converted to iteration counts for the schedule.
    boundaries = [i * iters for i in train_params["lr_epochs"]]
    # One learning rate per interval delimited by `boundaries`.
    values = [i * lr for i in train_params["lr_decay"]]

    if optimizer_method == "momentum":
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    else:
        optimizer = fluid.optimizer.RMSProp(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    return optimizer


def build_program(train_params, main_prog, startup_prog, args):
    """Build the training graph inside ``main_prog`` / ``startup_prog``.

    Args:
        train_params: dict providing "use_pyramidbox", "image_shape" and
            "class_num", plus the keys read by ``optimizer_setting``.
        main_prog: program that receives the network and optimizer ops.
        startup_prog: program that receives the initialization ops.
        args: parsed command-line arguments (not read in this function).

    Returns:
        Tuple ``(py_reader, fetches, loss)``. ``fetches`` is
        ``[face_loss, head_loss]`` for PyramidBox and ``[loss]`` otherwise;
        ``loss`` is the variable passed to ``optimizer.minimize``.
    """
    with_pyramidbox = train_params["use_pyramidbox"]
    input_shape = train_params["image_shape"]
    num_classes = train_params["class_num"]  # read for parity; not used below

    with fluid.program_guard(main_prog, startup_prog):
        # Four slots: image, face boxes, head boxes, ground-truth labels.
        data_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + input_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(
                data_reader)
            model = PyramidBox(
                image=image,
                face_box=face_box,
                head_box=head_box,
                gt_label=gt_label,
                sub_network=with_pyramidbox)
            if not with_pyramidbox:
                loss = model.vgg_ssd_loss()
                fetch_targets = [loss]
            else:
                face_loss, head_loss, loss = model.train()
                fetch_targets = [face_loss, head_loss]
            optimizer_setting(train_params).minimize(loss)
    return data_reader, fetch_targets, loss

def train(args, config, train_params, train_file_list):
    """Run the training loop and save a checkpoint after every epoch.

    Args:
        args: parsed command-line arguments. Reads ``use_gpu``,
            ``model_save_dir``, ``pretrained_model``, ``with_mem_opt``,
            ``enable_ce``, ``batch_num``, ``parallel`` and
            ``use_multiprocess``.
        config: reader settings, forwarded unchanged to ``reader.train``.
        train_params: dict providing "batch_size", "epoc_num",
            "use_pyramidbox" and "train_images", plus the keys read by
            ``build_program``.
        train_file_list: annotation file path, forwarded to ``reader.train``.

    Raises:
        ValueError: if the pre-trained model path does not exist.
    """
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    use_pyramidbox = train_params["use_pyramidbox"]

    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt

    # An unset or empty CUDA_VISIBLE_DEVICES counts as a single device.
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size_per_device = batch_size // devices_num
    # NOTE(review): iters_per_epoc (and therefore --batch_num) is computed
    # but never consumed below; an epoch always runs until the reader is
    # exhausted. Confirm whether a per-epoch batch limit was intended.
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # only for ce
    if args.enable_ce:
        SEED = 102
        startup_prog.random_seed = SEED
        train_prog.random_seed = SEED
        num_workers = 1
        pretrained_model = ""
        if args.batch_num is not None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params=train_params,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_epoc = 0
    if pretrained_model:
        # A purely numeric value means "resume from that saved epoch".
        if pretrained_model.isdigit():
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " % (pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist." %
                             (pretrained_model))

        def if_exist(var):
            # Load only variables that have a file in the model directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                use_multiprocess=args.use_multiprocess,
                                num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name)

    def save_model(postfix, program):
        # Replace any previous checkpoint stored under the same name.
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)

        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        # Kept separate from the per-batch start_time so that the epoch
        # duration is not lost when start_time is refreshed every batch.
        epoch_start_time = time.time()
        start_time = epoch_start_time
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if args.parallel:
                    fetch_vars = train_exe.run(
                        fetch_list=[v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
                face_loss = fetch_vars[0]
                # Only the PyramidBox network fetches a second (head) loss.
                if len(fetch_vars) > 1:
                    head_loss = fetch_vars[1]
                if batch_id % 10 == 0:
                    # The reported time is the interval between the starts
                    # of two consecutive iterations.
                    if not use_pyramidbox:
                        print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                            pass_id, batch_id, face_loss,
                            start_time - prev_start_time))
                    else:
                        print("Pass {:d}, batch {:d}, face loss {:.6f}, "
                              "head loss {:.6f}, "
                              "time {:.5f}".format(
                                  pass_id, batch_id, face_loss, head_loss,
                                  start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                # The reader is exhausted: the epoch is finished.
                train_py_reader.reset()
                break
        total_time += time.time() - epoch_start_time
        save_model(str(pass_id), train_prog)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        # Guard against a run in which no epoch was executed.
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / max(epoch_idx, 1)))
        print("kpis\ttrain_face_loss_card%s\t%s" %
              (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" %
              (gpu_num, head_loss))


def get_cards(args):
    """Return the number of GPU cards in use.

    Args:
        args: parsed arguments; reads ``enable_ce`` and ``num_devices``.

    Returns:
        int: when ``args.enable_ce`` is set, the number of comma-separated
        entries in the ``CUDA_VISIBLE_DEVICES`` environment variable, or
        ``args.num_devices`` if that variable is not set. When
        ``args.enable_ce`` is not set, ``args.num_devices``.
    """
    if not args.enable_ce:
        return args.num_devices
    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
    if cards is None:
        # Without the variable there is nothing to count; use the explicit
        # option instead of failing on None.split().
        return args.num_devices
    return len(cards.split(","))


if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    check_cuda(args.use_gpu)

    # Dataset locations relative to --data_dir.
    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
    train_file_list = os.path.join(args.data_dir,
        'wider_face_split/wider_face_train_bbx_gt.txt')
    # --mean_BGR is a comma-separated string of three numbers.
    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
    image_shape = [3, int(args.resize_h), int(args.resize_w)]
    # Command-line values override the defaults in train_parameters.
    train_parameters["image_shape"] = image_shape
    train_parameters["use_pyramidbox"] = args.use_pyramidbox
    train_parameters["batch_size"] = args.batch_size
    train_parameters["lr"] = args.learning_rate
    train_parameters["epoc_num"] = args.epoc_num

    config = reader.Settings(
        data_dir=data_dir,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        apply_distort=True,
        apply_expand=False,
        mean_value=mean_BGR,
        ap_version='11point')
    train(args, config, train_parameters, train_file_list)