train.py 7.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import shutil
import sys
import time

import numpy as np
import paddle
import paddle.fluid as fluid

from utility import parse_args, print_arguments, SmoothedValue
import reader
import models.model_builder as model_builder
import models.resnet as resnet
from learning_rate import exponential_with_warmup_decay
from config import cfg


J
jerrywgz 已提交
34
def train():
    """Build the Faster R-CNN training program and run it to completion.

    All settings are read from the module-level ``cfg``. The function builds
    the model and its four loss terms, attaches a Momentum optimizer whose
    learning rate comes from ``exponential_with_warmup_decay``, optionally
    loads pretrained variables from ``cfg.pretrained_model``, and then trains
    through either the py_reader loop or the feeder loop depending on
    ``cfg.use_pyreader``. Persistables are written under
    ``cfg.model_save_dir`` every ``cfg.TRAIN.snapshot_iter`` iterations and
    once more as ``model_final`` after the loop returns.
    """
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    if cfg.debug or cfg.enable_ce:
        # Pin every random source so debug / CE runs are repeatable.
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        random.seed(0)
        np.random.seed(0)

    # An unset or empty CUDA_VISIBLE_DEVICES counts as one device, because
    # "".split(",") yields [""].
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    model = model_builder.FasterRCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=True)
    model.build_model(image_shape)
    loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss()
    for loss_var in (loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss):
        # The individual terms are fetched by name in the training loops.
        loss_var.persistable = True
    loss = loss_cls + loss_bbox + rpn_cls_loss + rpn_reg_loss

    # Piecewise schedule: one value per interval delimited by cfg.lr_steps,
    # each scaled by a further factor of gamma.
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=cfg.warm_up_iter,
            warmup_factor=cfg.warm_up_factor),
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)

    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            # Load only variables that have a file in the pretrained dir.
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
    else:
        # Without this fallback both training loops below would raise
        # NameError on train_exe whenever cfg.parallel is false.
        train_exe = exe

    if cfg.use_pyreader:
        train_reader = reader.train(
            batch_size=cfg.TRAIN.im_per_batch,
            total_batch_size=total_batch_size,
            padding_total=cfg.TRAIN.padding_minibatch,
            shuffle=True)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(batch_size=total_batch_size, shuffle=True)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        """Save all persistables to ``cfg.model_save_dir/postfix``.

        An existing directory at that path is removed first.
        """
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    fetch_list = [loss, rpn_cls_loss, rpn_reg_loss, loss_cls, loss_bbox]
    # The names never change between iterations, so build the list once.
    fetch_names = [v.name for v in fetch_list]

    def report_ce_kpis(iters_done, total_time, last_loss):
        """Print the two CE KPI lines (mean iteration time, last loss)."""
        gpu_num = get_cards(cfg)
        # max(..., 1) avoids a ZeroDivisionError when no iteration ran.
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / max(iters_done, 1)))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, last_loss))

    def train_loop_pyreader():
        """Run ``cfg.max_iter`` iterations fed by the py_reader.

        Returns the mean of the per-iteration total loss. If the reader
        raises EOFException the reader is reset and the loop ends early
        (the CE report is skipped in that case, as before).
        """
        py_reader.start()
        smoothed_loss = SmoothedValue(cfg.log_window)
        every_pass_loss = []
        try:
            start_time = time.time()
            total_time = 0
            last_loss = 0
            # Pre-bind so the CE report is safe even if the loop never runs.
            iter_id = -1
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                losses = train_exe.run(fetch_list=fetch_names)
                loss_v = np.mean(np.array(losses[0]))
                every_pass_loss.append(loss_v)
                smoothed_loss.add_value(loss_v)
                lr = np.array(fluid.global_scope().find_var('learning_rate')
                              .get_tensor())
                # The printed time is the gap between the starts of the
                # previous and the current iteration.
                print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                    iter_id, lr[0],
                    smoothed_loss.get_median_value(),
                    start_time - prev_start_time))
                end_time = time.time()
                total_time += end_time - start_time
                last_loss = loss_v

                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            # only for ce
            if cfg.enable_ce:
                report_ce_kpis(iter_id + 1, total_time, last_loss)

        except fluid.core.EOFException:
            py_reader.reset()
        return np.mean(every_pass_loss)

    def train_loop():
        """Run up to ``cfg.max_iter`` iterations fed through the DataFeeder.

        Stops when the reader is exhausted or ``cfg.max_iter`` iterations
        have run, whichever comes first. Returns the mean of the
        per-iteration total loss.
        """
        start_time = time.time()
        total_time = 0
        last_loss = 0
        every_pass_loss = []
        smoothed_loss = SmoothedValue(cfg.log_window)
        # Pre-bind so the CE report is safe even if the reader is empty.
        iter_id = -1
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            losses = train_exe.run(fetch_list=fetch_names,
                                   feed=feeder.feed(data))
            loss_v = np.mean(np.array(losses[0]))
            every_pass_loss.append(loss_v)
            smoothed_loss.add_value(loss_v)
            lr = np.array(fluid.global_scope().find_var('learning_rate')
                          .get_tensor())
            end_time = time.time()
            total_time += end_time - start_time
            last_loss = loss_v
            print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                iter_id, lr[0],
                smoothed_loss.get_median_value(), start_time - prev_start_time))
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        # only for ce
        if cfg.enable_ce:
            report_ce_kpis(iter_id + 1, total_time, last_loss)

        return np.mean(every_pass_loss)

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
J
jerrywgz 已提交
203

J
jerrywgz 已提交
204

Z
zhengya01 已提交
205 206 207 208 209 210 211 212 213
def get_cards(cfg):
    """Return the device count used to label the CE KPI lines.

    When ``cfg.enable_ce`` is true the count is the number of
    comma-separated entries in the ``CUDA_VISIBLE_DEVICES`` environment
    variable; an unset or empty variable counts as one device, which matches
    how ``train()`` derives its device count. Otherwise ``cfg.num_devices``
    is returned unchanged.
    """
    if not cfg.enable_ce:
        return cfg.num_devices
    # os.environ.get returns None when the variable is unset; fall back to
    # "" so .split does not raise AttributeError.
    cards = os.environ.get('CUDA_VISIBLE_DEVICES') or ""
    return len(cards.split(","))


J
jerrywgz 已提交
214
if __name__ == '__main__':
    # Script entry point: obtain the run arguments from utility.parse_args,
    # pass them to utility.print_arguments (presumably to echo them; confirm
    # in utility), then start training.
    args = parse_args()
    print_arguments(args)
    train()