#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy as np
import random
import time
import shutil
from utility import parse_args, print_arguments, SmoothedValue

import paddle
import paddle.fluid as fluid
import reader
import models.yolov3 as models
from learning_rate import exponential_with_warmup_decay
from config import cfg


def train():
    """Build the YOLOv3 network and run the training loop configured by ``cfg``.

    Steps, in order:

    1. When ``cfg.debug`` is set, seed the fluid programs, ``random`` and
       ``numpy`` with fixed values.
    2. Create ``cfg.model_save_dir`` if it does not exist.
    3. Build the model and its loss, then attach a Momentum optimizer with L2
       weight decay and a warm-up / piecewise learning-rate schedule.
    4. Optionally load the variables found under ``cfg.pretrain``.
    5. Train with either the py_reader path or the feeder path
       (``cfg.use_pyreader``), saving persistables every
       ``cfg.snapshot_iter`` iterations.
    6. Save a final snapshot named ``model_final``.

    All settings are read from the module-level ``cfg``; nothing is returned.
    """
    if cfg.debug:
        # Fixed seeds so that debug runs are repeatable.
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    if not os.path.exists(cfg.model_save_dir):
        os.makedirs(cfg.model_save_dir)

    model = models.YOLOv3(cfg.model_cfg_path, use_pyreader=cfg.use_pyreader)
    model.build_model()
    input_size = model.get_input_size()
    loss = model.loss()
    # NOTE(review): presumably keeps the loss variable from being recycled by
    # fluid.memory_optimize() below so it can still be fetched -- confirm.
    loss.persistable = True

    print("cfg.learning", cfg.learning_rate)
    print("cfg.decay", cfg.decay)

    # An unset or empty CUDA_VISIBLE_DEVICES still yields 1 here, because
    # "".split(",") == [""].
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    print("Found {} CUDA devices.".format(devices_num))

    learning_rate = float(cfg.learning_rate)
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    if isinstance(gamma, list):
        # One explicit multiplier per schedule segment.
        values = [learning_rate * g for g in gamma]
    else:
        # Scalar gamma: geometric decay, one value per segment
        # (number of boundaries + 1).
        values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=cfg.warm_up_iter,
            warmup_factor=cfg.warm_up_factor,
            start_step=cfg.start_iter),
        regularization=fluid.regularizer.L2Decay(float(cfg.decay)),
        momentum=float(cfg.momentum))
    optimizer.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    base_exe = fluid.Executor(place)
    base_exe.run(fluid.default_startup_program())

    if cfg.pretrain:

        def if_exist(var):
            # Only load variables that have a file in the pretrain directory.
            return os.path.exists(os.path.join(cfg.pretrain, var.name))

        fluid.io.load_vars(base_exe, cfg.pretrain, predicate=if_exist)

    if cfg.parallel:
        exe = fluid.ParallelExecutor(
            use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
    else:
        exe = base_exe

    random_sizes = [cfg.input_size]
    if cfg.random_shape:
        # Multi-scale training: 320, 352, ..., 608.
        random_sizes = [32 * i for i in range(10, 20)]

    mixup_iter = cfg.max_iter - cfg.start_iter - cfg.no_mixup_iter
    if cfg.use_pyreader:
        train_reader = reader.train(
            input_size,
            # Floor division: this file enables `from __future__ import
            # division`, so `/` would hand the reader a float batch size.
            batch_size=int(cfg.batch) // devices_num,
            shuffle=True,
            mixup_iter=mixup_iter * devices_num,
            random_sizes=random_sizes,
            interval=10,
            pyreader_num=devices_num,
            use_multiprocessing=cfg.use_multiprocess)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(
            input_size,
            batch_size=int(cfg.batch),
            shuffle=True,
            mixup_iter=mixup_iter,
            random_sizes=random_sizes,
            use_multiprocessing=cfg.use_multiprocess)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        """Save all persistables to ``cfg.model_save_dir/postfix``, replacing any old copy."""
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(base_exe, model_path)

    fetch_list = [loss]
    # The fetch names never change, so build the list once for both loops.
    fetch_names = [v.name for v in fetch_list]

    def train_loop_pyreader():
        """Train from ``cfg.start_iter`` to ``cfg.max_iter`` using the py_reader."""
        py_reader.start()
        smoothed_loss = SmoothedValue()
        try:
            start_time = time.time()
            prev_start_time = start_time
            snapshot_loss = 0
            snapshot_time = 0
            for iter_id in range(cfg.start_iter, cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                losses = exe.run(fetch_list=fetch_names)
                # Reduce to a scalar; the fetched value may hold several entries.
                loss_value = np.mean(np.array(losses[0]))
                smoothed_loss.add_value(loss_value)
                snapshot_loss += loss_value
                # The reported time is the gap between the starts of two
                # consecutive iterations.
                snapshot_time += start_time - prev_start_time
                lr = np.array(fluid.global_scope().find_var('learning_rate')
                              .get_tensor())
                print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                    iter_id, lr[0],
                    smoothed_loss.get_mean_value(), start_time - prev_start_time))
                sys.stdout.flush()
                if (iter_id + 1) % cfg.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
                    print("Snapshot {} saved, average loss: {}, average time: {}".format(
                        iter_id + 1, snapshot_loss / float(cfg.snapshot_iter),
                        snapshot_time / float(cfg.snapshot_iter)))
                    snapshot_loss = 0
                    snapshot_time = 0
        except fluid.core.EOFException:
            # The reader ran out of data before max_iter; reset it.
            py_reader.reset()

    def train_loop():
        """Train by feeding batches from ``train_reader`` through the DataFeeder."""
        start_time = time.time()
        prev_start_time = start_time
        smoothed_loss = SmoothedValue()
        snapshot_loss = 0
        snapshot_time = 0
        for iter_id, data in enumerate(train_reader()):
            # enumerate() starts at 0; shift to the global iteration index.
            iter_id += cfg.start_iter
            prev_start_time = start_time
            start_time = time.time()
            losses = exe.run(fetch_list=fetch_names,
                             feed=feeder.feed(data))
            # Same scalar reduction as train_loop_pyreader, so the smoothed
            # and snapshot statistics are plain numbers in both paths.
            loss_value = np.mean(np.array(losses[0]))
            smoothed_loss.add_value(loss_value)
            snapshot_loss += loss_value
            snapshot_time += start_time - prev_start_time
            lr = np.array(fluid.global_scope().find_var('learning_rate')
                          .get_tensor())
            print("Iter {:d}, lr: {:.6f}, loss: {:.4f}, time {:.5f}".format(
                iter_id, lr[0], smoothed_loss.get_mean_value(), start_time - prev_start_time))
            sys.stdout.flush()

            if (iter_id + 1) % cfg.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
                print("Snapshot {} saved, average loss: {}, average time: {}".format(
                    iter_id + 1, snapshot_loss / float(cfg.snapshot_iter),
                    snapshot_time / float(cfg.snapshot_iter)))
                snapshot_loss = 0
                snapshot_time = 0
            if (iter_id + 1) == cfg.max_iter:
                print("Finish iter {}".format(iter_id))
                break

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')


if __name__ == '__main__':
    # Script entry point: parse the command-line arguments, echo them, then
    # start training.
    # NOTE(review): train() reads its settings from `cfg`, not from `args`;
    # presumably parse_args() merges the parsed flags into `cfg` -- confirm
    # in utility.py.
    args = parse_args()
    print_arguments(args)
    train()