train.py 11.1 KB
Newer Older
X
xiaoting 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
B
baiyfbupt 已提交
14 15 16 17
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

18
import os
19
import shutil
20
import numpy as np
21
import time
22 23 24
import argparse
import functools

25 26 27 28 29 30 31 32 33 34 35 36 37 38

def set_paddle_flags(**kwargs):
    """Export each keyword argument as an environment variable if absent.

    A name that is already present in ``os.environ`` is left untouched, so a
    value exported by the caller's shell wins over the default passed here.
    Values are converted with ``str()`` before being stored.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            continue
        os.environ[flag_name] = str(kwargs[flag_name])


# NOTE(paddle-dev): every flag set here must already be in the environment
# when `paddle` is imported; setting it afterwards has no effect. Keep this
# call above the `import paddle` line.
set_paddle_flags(FLAGS_eager_delete_tensor_gb=0)  # enable GC to save memory

39
import paddle
40 41
import paddle.fluid as fluid
from pyramidbox import PyramidBox
Q
qingqing01 已提交
42
import reader
L
LielinJiang 已提交
43
from utility import add_arguments, print_arguments, check_cuda
44 45 46

# Command-line interface. `add_arg` is `utility.add_arguments` with this
# parser pre-bound; each row of the table below holds the positional
# arguments of one `add_arg` call, in registration order.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
_ARGUMENT_TABLE = (
    ('parallel',         bool,  True,            "Whether use multi-GPU/threads or not."),
    ('learning_rate',    float, 0.001,           "The start learning rate."),
    ('batch_size',       int,   16,              "Minibatch size."),
    ('epoc_num',         int,   160,             "Epoch number."),
    ('use_gpu',          bool,  True,            "Whether use GPU."),
    ('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model."),
    ('model_save_dir',   str,   'output',        "The path to save model."),
    ('resize_h',         int,   640,             "The resized image height."),
    ('resize_w',         int,   640,             "The resized image width."),
    ('mean_BGR',         str,   '104., 117., 123.', "Mean value for B,G,R channel which will be subtracted."),
    ('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path."),
    ('data_dir',         str,   'data',          "The base dir of dataset"),
    ('use_multiprocess', bool,  True,            "Whether use multi-process for data preprocessing."),
)
for _arg_spec in _ARGUMENT_TABLE:
    add_arg(*_arg_spec)

# Options used by the continuous-evaluation (CE) runs.
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--batch_num', type=int, help="batch num for ce")
parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
#yapf: enable

67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
# Default training hyper-parameters. The script entry point overwrites
# image_shape, use_pyramidbox, batch_size, lr and epoc_num with the values
# given on the command line before training starts.
train_parameters = {
    # Divided by the batch size to obtain the iterations per epoch.
    'train_images': 12880,
    'image_shape': [3, 640, 640],
    'class_num': 2,
    'batch_size': 16,
    # Base learning rate; scaled by each entry of lr_decay.
    'lr': 0.001,
    # Epoch indices that are converted to iteration boundaries for the
    # piecewise learning-rate schedule.
    'lr_epochs': [99, 124, 149],
    # Multipliers applied to lr, one per schedule segment.
    'lr_decay': [1, 0.1, 0.01, 0.001],
    'epoc_num': 160,
    # "momentum" selects Momentum; any other value selects RMSProp.
    'optimizer_method': 'momentum',
    'use_pyramidbox': True,
}

def optimizer_setting(train_params):
    """Create the optimizer described by ``train_params``.

    The learning rate follows a piecewise-constant schedule: the epochs in
    ``lr_epochs`` are converted to iteration boundaries using
    ``train_images // batch_size`` iterations per epoch, and the rate of each
    segment is ``lr`` scaled by the matching entry of ``lr_decay``.

    Args:
        train_params: dict providing "batch_size", "train_images", "lr",
            "optimizer_method", "lr_epochs" and "lr_decay".

    Returns:
        A ``fluid.optimizer.Momentum`` (momentum 0.9) when
        ``optimizer_method`` is "momentum", otherwise a
        ``fluid.optimizer.RMSProp``. Both use L2 decay of 0.0005.
    """
    batch_size = train_params["batch_size"]
    steps_per_epoch = train_params["train_images"] // batch_size
    base_lr = train_params["lr"]
    use_momentum = train_params["optimizer_method"] == "momentum"
    boundaries = [
        epoch * steps_per_epoch for epoch in train_params["lr_epochs"]
    ]
    values = [factor * base_lr for factor in train_params["lr_decay"]]

    # Both optimizers share the same schedule and regularizer settings.
    shared_kwargs = dict(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.0005))
    if use_momentum:
        return fluid.optimizer.Momentum(momentum=0.9, **shared_kwargs)
    return fluid.optimizer.RMSProp(**shared_kwargs)


def build_program(train_params, main_prog, startup_prog, args):
    """Assemble the training graph inside ``main_prog`` / ``startup_prog``.

    Creates a py_reader feeding (image, face_box, head_box, gt_label), builds
    the PyramidBox network on top of it, and attaches the optimizer returned
    by ``optimizer_setting`` to the loss.

    Args:
        train_params: dict providing "use_pyramidbox", "image_shape" and
            "class_num", plus the keys read by ``optimizer_setting``.
        main_prog: program that receives the network and optimizer ops.
        startup_prog: program that receives the initialization ops.
        args: accepted for interface compatibility; not read here.

    Returns:
        Tuple ``(py_reader, fetches, loss)``. ``fetches`` is
        ``[face_loss, head_loss]`` when ``use_pyramidbox`` is truthy and
        ``[loss]`` otherwise.
    """
    with_pyramidbox = train_params["use_pyramidbox"]
    input_shape = train_params["image_shape"]
    class_num = train_params["class_num"]  # read but not consumed by this graph
    feed_shapes = [[-1] + input_shape, [-1, 4], [-1, 4], [-1, 1]]

    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=feed_shapes,
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(
                py_reader)
            network = PyramidBox(
                image=image,
                face_box=face_box,
                head_box=head_box,
                gt_label=gt_label,
                sub_network=with_pyramidbox)
            if not with_pyramidbox:
                # Plain VGG-SSD variant: a single loss is optimized and fetched.
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            else:
                # The combined loss is optimized; the two parts are fetched.
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            optimizer_setting(train_params).minimize(loss)
    return py_reader, fetches, loss

def train(args, config, train_params, train_file_list):
    """Run the training loop and save a checkpoint after every epoch.

    Args:
        args: parsed command-line namespace. Reads ``use_gpu``,
            ``model_save_dir``, ``pretrained_model``, ``enable_ce``,
            ``batch_num``, ``use_multiprocess`` and ``parallel``.
        config: reader settings object forwarded to ``reader.train``.
        train_params: dict of hyper-parameters; reads "batch_size",
            "epoc_num", "use_pyramidbox" and "train_images", and is passed on
            to ``build_program``.
        train_file_list: annotation file path forwarded to ``reader.train``.

    Raises:
        ValueError: if a pre-trained model path is given but does not exist.
    """
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    use_pyramidbox = train_params["use_pyramidbox"]

    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model

    # An unset or empty CUDA_VISIBLE_DEVICES counts as a single device.
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size_per_device = batch_size // devices_num
    # NOTE(review): iters_per_epoc is not consumed by the loop below, which
    # runs until the reader is exhausted, so --batch_num has no effect here.
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # only for ce: fixed seed, no shuffling, single worker, no pre-trained
    # weights.
    if args.enable_ce:
        is_shuffle = False
        SEED = 102
        startup_prog.random_seed = SEED
        train_prog.random_seed = SEED
        num_workers = 1
        pretrained_model = ""
        if args.batch_num is not None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params=train_params,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_epoc = 0
    if pretrained_model:
        # An all-digit value names a checkpoint saved by a previous run under
        # model_save_dir; training resumes with the following epoch.
        if pretrained_model.isdigit():
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " %(pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist." %
                             (pretrained_model))

        def if_exist(var):
            # Only load variables that have a file in the model directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                use_multiprocess=args.use_multiprocess,
                                num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name)

    def save_model(postfix, program):
        """Save the persistable variables to ``model_save_dir/postfix``."""
        model_path = os.path.join(model_save_dir, postfix)
        # Replace any checkpoint already stored under the same name.
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)

        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        # Kept separate from the per-batch timestamps so that the epoch
        # duration spans the whole epoch, not just its last iteration.
        epoch_start_time = time.time()
        start_time = epoch_start_time
        prev_start_time = start_time
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                # The logged time is the gap between the starts of two
                # consecutive iterations.
                prev_start_time = start_time
                start_time = time.time()
                if args.parallel:
                    fetch_vars = train_exe.run(fetch_list=
                        [v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
                face_loss = fetch_vars[0]
                # Without PyramidBox only one loss is fetched, so there is no
                # second entry to read.
                if use_pyramidbox:
                    head_loss = fetch_vars[1]
                if batch_id % 10 == 0:
                    if not use_pyramidbox:
                        print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                            pass_id, batch_id, face_loss,
                            start_time - prev_start_time))
                    else:
                        print("Pass {:d}, batch {:d}, face loss {:.6f}, " \
                              "head loss {:.6f}, " \
                              "time {:.5f}".format(pass_id,
                               batch_id, face_loss, head_loss,
                               start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                # The reader is exhausted: this epoch is complete.
                train_py_reader.reset()
                break
        total_time += time.time() - epoch_start_time
        save_model(str(pass_id), train_prog)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        # epoch_idx is 0 when the epoch range was empty; avoid dividing by it.
        avg_pass_time = total_time / epoch_idx if epoch_idx else 0.0
        print("kpis\teach_pass_duration_card%s\t%s" %
                (gpu_num, avg_pass_time))
        print("kpis\ttrain_face_loss_card%s\t%s" %
                (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" %
                (gpu_num, head_loss))

261

Z
zhengya01 已提交
262 263 264 265 266 267 268 269 270 271

def get_cards(args):
    """Return the device count for this run.

    In continuous-evaluation mode (``args.enable_ce``) the count is the
    number of comma-separated entries in ``CUDA_VISIBLE_DEVICES``; otherwise
    it is ``args.num_devices``.

    An unset ``CUDA_VISIBLE_DEVICES`` is treated like an empty one and yields
    1, which matches how ``train()`` counts devices.
    """
    if not args.enable_ce:
        return args.num_devices
    # os.environ.get returns None when the variable is unset; fall back to ""
    # so that .split() is always called on a string.
    cards = os.environ.get('CUDA_VISIBLE_DEVICES') or ""
    return len(cards.split(","))


272 273 274
if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    check_cuda(args.use_gpu)

    # Image directory and annotation list, both relative to --data_dir.
    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
    train_file_list = os.path.join(
        args.data_dir, 'wider_face_split/wider_face_train_bbx_gt.txt')

    # --mean_BGR arrives as a comma-separated string.
    mean_BGR = list(map(float, args.mean_BGR.split(",")))
    image_shape = [3, int(args.resize_h), int(args.resize_w)]

    # Command-line values replace the defaults in train_parameters.
    train_parameters.update(
        image_shape=image_shape,
        use_pyramidbox=args.use_pyramidbox,
        batch_size=args.batch_size,
        lr=args.learning_rate,
        epoc_num=args.epoc_num)

    reader_options = dict(
        data_dir=data_dir,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        apply_distort=True,
        apply_expand=False,
        mean_value=mean_BGR,
        ap_version='11point')
    config = reader.Settings(**reader_options)

    train(args, config, train_parameters, train_file_list)