train.py 7.8 KB
Newer Older
D
dengkaipeng 已提交
1
#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
D
dengkaipeng 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
19

20

21 22 23 24 25
def set_paddle_flags(flags):
    """Export default flag values into the process environment.

    Args:
        flags (dict): maps environment variable names to default values.
            Each value is converted with ``str()`` before it is stored.

    A variable that is already present in ``os.environ`` is left untouched,
    so a setting exported before this call takes precedence.
    """
    for name in flags:
        if name not in os.environ:
            os.environ[name] = str(flags[name])

26

27
# Apply the default flags before `paddle` is imported further down in this
# file; any flag already exported in the environment is kept as-is.
set_paddle_flags({
    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc
    'FLAGS_memory_fraction_of_eager_deletion': 1,
    'FLAGS_fraction_of_gpu_memory_to_use': 0.98
})

D
dengkaipeng 已提交
33 34 35 36 37
import sys
import numpy as np
import random
import time
import shutil
K
Kaipeng Deng 已提交
38
import subprocess
39 40
from utility import (parse_args, print_arguments, 
                     SmoothedValue, check_gpu)
D
dengkaipeng 已提交
41 42 43

import paddle
import paddle.fluid as fluid
44
from paddle.fluid import profiler
D
dengkaipeng 已提交
45
import reader
D
dengkaipeng 已提交
46
from models.yolov3 import YOLOv3
D
dengkaipeng 已提交
47
from learning_rate import exponential_with_warmup_decay
T
tink2123 已提交
48
from config import cfg
49 50 51 52 53 54
import dist_utils

# Number of trainer processes, read from PADDLE_TRAINERS_NUM; 1 when unset.
num_trainers = int(os.getenv('PADDLE_TRAINERS_NUM', 1))

def get_device_num():
    """Return the number of devices this process trains on.

    Returns:
        int: 1 when more than one trainer process is configured
        (``num_trainers > 1``); otherwise the value reported by
        ``fluid.core.get_cuda_device_count()``.
    """
    # NOTE(zcd): for multi-process training, each process uses one GPU card.
    if num_trainers > 1:
        return 1
    return fluid.core.get_cuda_device_count()
D
dengkaipeng 已提交
58 59 60 61


def train():
    """Build the YOLOv3 training program and run the training loop.

    The function reads its configuration from the module-level ``cfg`` and
    from the module-level ``args`` (in this file ``args`` is only assigned
    under the ``__main__`` guard, so it must exist before this is called).

    Steps, in order:
      * optionally fix random seeds (``cfg.debug`` or ``args.enable_ce``);
      * build the model and its loss, and attach a Momentum optimizer whose
        learning rate comes from ``exponential_with_warmup_decay``;
      * run the startup program and, if ``cfg.pretrain`` is set, load every
        variable that has a file of the same name under that directory;
      * compile the main program with ``with_data_parallel``;
      * iterate from ``cfg.start_iter`` to ``cfg.max_iter``, printing the
        smoothed loss each iteration and saving a snapshot every
        ``cfg.snapshot_iter`` iterations;
      * save the final persistables under ``model_final``.

    Returns:
        None. When ``args.is_profiler`` is set, the function returns right
        after the profiler is stopped at iteration 10, without saving
        ``model_final``.
    """
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)

    if cfg.debug or args.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    if not os.path.exists(cfg.model_save_dir):
        os.makedirs(cfg.model_save_dir)

    model = YOLOv3()
    model.build_model()
    input_size = cfg.input_size
    loss = model.loss()
    loss.persistable = True

    devices_num = get_device_num()
    print("Found {} CUDA devices.".format(devices_num))

    # Piecewise schedule: one value per interval delimited by cfg.lr_steps,
    # each a further factor of gamma below the base learning rate.
    learning_rate = cfg.learning_rate
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=cfg.warm_up_iter,
            warmup_factor=cfg.warm_up_factor),
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrain:
        # A missing path is only reported; loading below then simply finds
        # no matching variable files.
        if not os.path.exists(cfg.pretrain):
            print("Pretrain weights not found: {}".format(cfg.pretrain))

        def if_exist(var):
            # Load only variables that have a same-named file on disk.
            return os.path.exists(os.path.join(cfg.pretrain, var.name))

        fluid.io.load_vars(exe, cfg.pretrain, predicate=if_exist)

    build_strategy = fluid.BuildStrategy()
    # gc and memory optimize may conflict
    build_strategy.memory_optimize = False
    syncbn = cfg.syncbn
    # get_device_num() returns 1 whenever num_trainers > 1, so this turns
    # syncbn off in exactly the same cases as before; the message is just no
    # longer printed when syncbn was not requested in the first place.
    if syncbn and (devices_num <= 1 or num_trainers > 1):
        print("Disable syncbn in single device")
        syncbn = False
    build_strategy.sync_batch_norm = syncbn

    exec_strategy = fluid.ExecutionStrategy()
    if cfg.use_gpu and num_trainers > 1:
        dist_utils.prepare_for_multi_process(exe, build_strategy,
                                             fluid.default_main_program())
        exec_strategy.num_threads = 1

    compile_program = fluid.compiler.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    random_sizes = [cfg.input_size]
    if cfg.random_shape:
        random_sizes = [32 * i for i in range(10, 20)]

    total_iter = cfg.max_iter - cfg.start_iter
    mixup_iter = total_iter - cfg.no_mixup_iter

    shuffle = True
    if args.enable_ce:
        shuffle = False
    shuffle_seed = None
    # NOTE: yolov3 is a special model, if num_trainers > 1, each process
    # trains on the complete dataset.
    # if num_trainers > 1: shuffle_seed  = 1
    train_reader = reader.train(
        input_size,
        batch_size=cfg.batch_size,
        shuffle=shuffle,
        shuffle_seed=shuffle_seed,
        total_iter=total_iter * devices_num,
        mixup_iter=mixup_iter * devices_num,
        random_sizes=random_sizes,
        use_multiprocess_reader=cfg.use_multiprocess_reader,
        num_workers=cfg.worker_num)
    py_reader = model.py_reader
    py_reader.decorate_paddle_reader(train_reader)

    def save_model(postfix):
        """Save persistables to cfg.model_save_dir/<postfix>, replacing any
        existing directory of that name."""
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    fetch_list = [loss]

    py_reader.start()
    smoothed_loss = SmoothedValue()
    try:
        start_time = time.time()
        prev_start_time = start_time
        snapshot_loss = 0
        snapshot_time = 0
        for iter_id in range(cfg.start_iter, cfg.max_iter):
            # The reported time is the gap between the starts of two
            # consecutive iterations, so it lags the loss by one iteration.
            prev_start_time = start_time
            start_time = time.time()
            losses = exe.run(compile_program,
                             fetch_list=[v.name for v in fetch_list])
            loss_value = np.mean(np.array(losses[0]))
            iter_time = start_time - prev_start_time
            smoothed_loss.add_value(loss_value)
            snapshot_loss += loss_value
            snapshot_time += iter_time
            lr = np.array(fluid.global_scope().find_var('learning_rate')
                          .get_tensor())
            print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                iter_id, lr[0], smoothed_loss.get_mean_value(), iter_time))
            sys.stdout.flush()

            # add profiler tools
            if args.is_profiler and iter_id == 5:
                profiler.start_profiler("All")
            elif args.is_profiler and iter_id == 10:
                profiler.stop_profiler("total", args.profiler_path)
                return

            if (iter_id + 1) % cfg.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
                avg_loss = snapshot_loss / float(cfg.snapshot_iter)
                avg_time = snapshot_time / float(cfg.snapshot_iter)
                # Adjacent literals instead of a backslash continuation, so
                # source indentation does not leak into the message.
                print("Snapshot {} saved, average loss: {}, "
                      "average time: {}".format(iter_id + 1, avg_loss,
                                                avg_time))
                if args.enable_ce and iter_id == cfg.max_iter - 1:
                    if devices_num == 1:
                        print("kpis\ttrain_cost_1card\t%f" % avg_loss)
                        print("kpis\ttrain_duration_1card\t%f" % avg_time)
                    else:
                        print("kpis\ttrain_cost_8card\t%f" % avg_loss)
                        print("kpis\ttrain_duration_8card\t%f" % avg_time)

                snapshot_loss = 0
                snapshot_time = 0
    except fluid.core.EOFException:
        py_reader.reset()

    save_model('model_final')


if __name__ == '__main__':
    # `args` is bound at module level on purpose: train() reads it as a
    # global, so it must be assigned before train() is called.
    args = parse_args()
    print_arguments(args)
    train()