#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
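"""Train the PyramidBox face detection model on the WIDER FACE dataset with PaddlePaddle Fluid."""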
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import shutil
import numpy as np
import time
import argparse
import functools


def set_paddle_flags(**kwargs):
    for key, value in kwargs.items():
        if os.environ.get(key, None) is None:
            os.environ[key] = str(value)


# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect. 
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)

import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
import reader
from utility import add_arguments, print_arguments, check_cuda

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
add_arg('parallel',         bool,  True,            "Whether to use multi-GPU/threads or not.")
add_arg('learning_rate',    float, 0.001,           "The start learning rate.")
add_arg('batch_size',       int,   16,              "Minibatch size.")
add_arg('epoc_num',         int,   160,             "Number of training epochs.")
add_arg('use_gpu',          bool,  True,            "Whether to use GPU.")
add_arg('use_pyramidbox',   bool,  True,            "Whether to use the PyramidBox model.")
add_arg('model_save_dir',   str,   'output',        "The path to save the model.")
add_arg('resize_h',         int,   640,             "The resized image height.")
add_arg('resize_w',         int,   640,             "The resized image width.")
add_arg('mean_BGR',         str,   '104., 117., 123.', "Mean values for the B, G, R channels, which will be subtracted.")
add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
add_arg('data_dir',         str,   'data',          "The base directory of the dataset.")
add_arg('use_multiprocess', bool,  True,            "Whether to use multi-process data preprocessing.")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--batch_num', type=int, help="Number of batches per epoch when --enable_ce is set.")
parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
# yapf: enable

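# Default training hyper-parameters; most of these are overridden from the
# command-line arguments in the __main__ block below.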
train_parameters = {
    "train_images": 12880,
    "image_shape": [3, 640, 640],
    "class_num": 2,
    "batch_size": 16,
    "lr": 0.001,
    "lr_epochs": [99, 124, 149],
    "lr_decay": [1, 0.1, 0.01, 0.001],
    "epoc_num": 160,
    "optimizer_method": "momentum",
    "use_pyramidbox": True
}

def optimizer_setting(train_params):
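    """Build the optimizer with a piecewise-decay learning rate and L2 weight decay.

    Epoch-based decay boundaries from train_params are converted to iteration
    counts using the dataset size and batch size.
    """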
    batch_size = train_params["batch_size"]
    iters = train_params["train_images"] // batch_size
    lr = train_params["lr"]
    optimizer_method = train_params["optimizer_method"]
    boundaries = [i * iters for i in train_params["lr_epochs"]]
    values = [i * lr for i in train_params["lr_decay"]]

    if optimizer_method == "momentum":
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    else:
        optimizer = fluid.optimizer.RMSProp(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    return optimizer


def build_program(train_params, main_prog, startup_prog, args):
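    """Build the training program inside main_prog.

    A py_reader feeds (image, face_box, head_box, gt_label) batches into the
    PyramidBox network, and the optimizer is attached to the resulting loss.
    Returns the reader, the list of fetch targets, and the loss variable.
    """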
    use_pyramidbox = train_params["use_pyramidbox"]
    image_shape = train_params["image_shape"]
    class_num = train_params["class_num"]
    with fluid.program_guard(main_prog, startup_prog):
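        # The reader produces image tensors plus LoD tensors for face boxes,
        # head boxes, and ground-truth labels (lod_levels 0, 1, 1, 1).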
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(py_reader)
            fetches = []
            network = PyramidBox(image=image,
                                 face_box=face_box,
                                 head_box=head_box,
                                 gt_label=gt_label,
                                 sub_network=use_pyramidbox)
            if use_pyramidbox:
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            optimizer = optimizer_setting(train_params)
            optimizer.minimize(loss)
    return py_reader, fetches, loss

def train(args, config, train_params, train_file_list):
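    """Run the full training loop.

    Builds the startup and train programs, optionally restores pre-trained or
    checkpointed weights, iterates over epochs and batches, and saves a model
    snapshot after every epoch.
    """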
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    optimizer_method = train_params["optimizer_method"]
    use_pyramidbox = train_params["use_pyramidbox"]

    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
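    # Split the global batch across the visible devices; an unset or empty
    # CUDA_VISIBLE_DEVICES counts as a single device.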
    batch_size_per_device = batch_size // devices_num
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # only for continuous evaluation (CE): fix seeds and simplify the run
    if args.enable_ce:
        SEED = 102
        startup_prog.random_seed = SEED
        train_prog.random_seed = SEED
        num_workers = 1
        pretrained_model = ""
        if args.batch_num is not None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params=train_params,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_epoc = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " %(pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist." %
                             (pretrained_model))
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)
    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                use_multiprocess=args.use_multiprocess,
                                num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
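        # ParallelExecutor runs the training program data-parallel across the
        # available devices, aggregating gradients on the given loss.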
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name)

    def save_model(postfix, program):
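        """Save all persistable variables to model_save_dir/<postfix>, replacing any existing snapshot."""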
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)

        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        start_time = time.time()
        prev_start_time = start_time
        end_time = 0
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if args.parallel:
                    fetch_vars = train_exe.run(fetch_list=
                        [v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
                end_time = time.time()
                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
                face_loss = fetch_vars[0]
                # Guard the second fetch: the plain VGG-SSD path returns only a
                # single loss, so there is no separate head loss to read.
                head_loss = fetch_vars[1] if len(fetch_vars) > 1 else head_loss
                if batch_id % 10 == 0:
                    if not args.use_pyramidbox:
                        print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                            pass_id, batch_id, face_loss,
                            start_time - prev_start_time))
                    else:
                        print("Pass {:d}, batch {:d}, face loss {:.6f}, " \
                              "head loss {:.6f}, " \
                              "time {:.5f}".format(pass_id,
                               batch_id, face_loss, head_loss,
                               start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
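                # The reader is exhausted for this epoch; reset it and fall
                # through to checkpointing.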
                train_py_reader.reset()
                break
        epoch_end_time = time.time()
        total_time += epoch_end_time - start_time
        save_model(str(pass_id), train_prog)

    # only for continuous evaluation (CE): report KPIs
    if args.enable_ce:
        gpu_num = get_cards(args)
        print("kpis\teach_pass_duration_card%s\t%s" %
                (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_face_loss_card%s\t%s" %
                (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" %
                (gpu_num, head_loss))


def get_cards(args):
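    """Return the number of GPU cards.

    Under CE the count is parsed from CUDA_VISIBLE_DEVICES; otherwise the
    --num_devices argument is used.
    """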
    if args.enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        num = len(cards.split(","))
        return num
    else:
        return args.num_devices


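# Example invocation (paths are illustrative; adjust to your local dataset and
# pre-trained VGG layout):
#   python train.py --use_gpu=True --batch_size=16 --data_dir=data \
#       --pretrained_model=./vgg_ilsvrc_16_fc_reduced/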
if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    check_cuda(args.use_gpu)

    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
    train_file_list = os.path.join(args.data_dir,
        'wider_face_split/wider_face_train_bbx_gt.txt')
    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
    image_shape = [3, int(args.resize_h), int(args.resize_w)]
    train_parameters["image_shape"] = image_shape
    train_parameters["use_pyramidbox"] = args.use_pyramidbox
    train_parameters["batch_size"] = args.batch_size
    train_parameters["lr"] = args.learning_rate
    train_parameters["epoc_num"] = args.epoc_num

    config = reader.Settings(
        data_dir=data_dir,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        apply_distort=True,
        apply_expand=False,
        mean_value=mean_BGR,
        ap_version='11point')
    train(args, config, train_parameters, train_file_list)