train.py 8.3 KB
Newer Older
R
ruri 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

R
root 已提交
15 16 17
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
R
ruri 已提交
18

19 20 21 22
import os
import numpy as np
import time
import sys
23

24 25 26 27 28 29 30 31 32 33 34 35 36
def set_paddle_flags(flags):
    """Export each flag as an environment variable unless it is already set.

    Args:
        flags: mapping of environment variable name to value; values are
            converted with ``str`` before being stored.
    """
    for name in flags:
        # A value already present in the environment takes precedence.
        if name not in os.environ:
            os.environ[name] = str(flags[name])


# NOTE(paddle-dev): these flags must be set before `import paddle`;
# otherwise they do not take any effect.
_PADDLE_FLAGS = {
    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc
    'FLAGS_fraction_of_gpu_memory_to_use': 0.98,
}
set_paddle_flags(_PADDLE_FLAGS)
R
ruri 已提交
37

38
import paddle
39
import paddle.fluid as fluid
R
ruri 已提交
40 41
import reader
from utils import *
42
import models
R
ruri 已提交
43 44
from build_model import create_model

R
ruri 已提交
45
def build_program(is_train, main_prog, startup_prog, args):
    """Build the train or test program; in train mode also add backward ops.

    Args:
        is_train: True to build the training program, False for the test one.
        main_prog: main program the model is built into.
        startup_prog: startup program used together with ``main_prog``.
        args: arguments. Reads ``model``, ``random_seed`` and, in train
            mode, ``use_ema`` (plus ``ema_decay`` when it is set). For
            models whose name starts with ``EfficientNet`` it also reads
            ``drop_connect_rate`` and ``padding_type``.

    Returns:
        The output list produced by ``create_model``, extended in place:
            train mode: [*outputs, global_lr, py_reader]
            train mode with ``args.use_ema``: [*outputs, global_lr, ema, py_reader]
            test mode: [*outputs, py_reader]
    """
    model_builder = models.__dict__[args.model]
    if args.model.startswith('EfficientNet'):
        model = model_builder(
            is_test=not is_train,
            override_params={"drop_connect_rate": args.drop_connect_rate},
            padding_type=args.padding_type)
    else:
        model = model_builder()

    with fluid.program_guard(main_prog, startup_prog):
        if args.random_seed:
            main_prog.random_seed = args.random_seed
            startup_prog.random_seed = args.random_seed
        with fluid.unique_name.guard():
            py_reader, loss_out = create_model(model, args, is_train)
            # Add backward ops to the program.
            if is_train:
                optimizer = create_optimizer(args)
                # The first output of create_model is the cost to minimize.
                avg_cost = loss_out[0]
                optimizer.minimize(avg_cost)
                # XXX: the learning rate is fetched through a private
                # optimizer method; a better implementation is required.
                global_lr = optimizer._global_learning_rate()
                global_lr.persistable = True
                loss_out.append(global_lr)
                if args.use_ema:
                    global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
                    ema = ExponentialMovingAverage(args.ema_decay, thres_steps=global_steps)
                    ema.update()
                    loss_out.append(ema)
            # The reader is always the last element of the returned list.
            loss_out.append(py_reader)
    return loss_out
R
ruri 已提交
87

88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
def validate(args, test_py_reader, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record):
    """Run the test program over the whole test reader and print metrics.

    Each batch is executed with ``exe.run`` and reported through
    ``print_info`` in "batch" mode. Once the reader raises
    ``fluid.core.EOFException`` it is reset, and one "epoch" line is printed
    with the train epoch averages followed by the test epoch averages.

    Args:
        args: arguments; only ``print_step`` is read here.
        test_py_reader: reader feeding ``test_prog``; started here and reset
            when it is exhausted.
        exe: executor used to run ``test_prog``.
        test_prog: program to evaluate.
        test_fetch_list: names of the variables fetched on every batch.
        pass_id: index of the current epoch, forwarded to ``print_info``.
        train_batch_metrics_record: per-batch train metrics of this epoch;
            averaged over axis 0 for the epoch summary.
    """
    batch_times = []
    batch_metrics = []
    batch_idx = 0

    test_py_reader.start()
    try:
        while True:
            started = time.time()
            fetched = exe.run(program=test_prog, fetch_list=test_fetch_list)
            elapsed = time.time() - started
            batch_times.append(elapsed)

            # Reduce every fetched variable to one value for this batch.
            fetched_avg = np.mean(np.array(fetched), axis=1)
            batch_metrics.append(fetched_avg)

            print_info(pass_id, batch_idx, args.print_step, fetched_avg,
                       elapsed, "batch")
            sys.stdout.flush()
            batch_idx += 1
    except fluid.core.EOFException:
        # The reader is exhausted: reset it so it can be started again.
        test_py_reader.reset()

    train_epoch_avg = np.mean(np.array(train_batch_metrics_record), axis=0)
    test_time_avg = np.mean(np.array(batch_times))
    test_epoch_avg = np.mean(np.array(batch_metrics), axis=0)

    epoch_metrics = list(train_epoch_avg) + list(test_epoch_avg)
    print_info(pass_id, 0, 0, epoch_metrics, test_time_avg, "epoch")
R
ruri 已提交
124 125

def train(args):
    """Train the model, validating after every epoch on trainer 0.

    Builds the train and test programs, initializes the model, then runs
    ``args.num_epochs`` epochs. After each epoch trainer 0 validates (a
    second time under EMA weights when ``args.use_ema`` is set) and saves
    the model whenever ``pass_id % args.save_step == 0``.

    Args:
        args: all arguments.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    train_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    # The reader is the last element; with EMA enabled the EMA object sits
    # right before it and must not end up in the fetch list.
    train_py_reader = train_out[-1]
    ema = None
    if args.use_ema:
        train_fetch_vars = train_out[:-2]
        ema = train_out[-2]
    else:
        train_fetch_vars = train_out[:-1]
    train_fetch_list = [var.name for var in train_fetch_vars]

    test_out = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader = test_out[-1]
    test_fetch_vars = test_out[:-1]
    test_fetch_list = [var.name for var in test_fetch_vars]

    # Create test_prog and set layers' is_test params to True.
    test_prog = test_prog.clone(for_test=True)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Init model from a checkpoint or a pretrained model.
    init_model(exe, args, train_prog)
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None)
    train_reader = imagenet_reader.train(settings=args)
    test_reader = imagenet_reader.val(settings=args)

    train_py_reader.decorate_sample_list_generator(train_reader, place)
    test_py_reader.decorate_sample_list_generator(test_reader, place)

    compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                 train_fetch_vars[0], exe)
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    for pass_id in range(args.num_epochs):
        if num_trainers > 1:
            # The shuffle seed changes every epoch, offset by random_seed.
            imagenet_reader.set_shuffle_seed(pass_id + (args.random_seed if args.random_seed else 0))

        train_batch_metrics_record = _train_one_epoch(
            args, exe, compiled_train_prog, train_fetch_list,
            train_py_reader, pass_id, trainer_id)

        if trainer_id == 0:
            if args.use_ema:
                print('ExponentialMovingAverage validate start...')
                with ema.apply(exe):
                    validate(args, test_py_reader, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record)
                print('ExponentialMovingAverage validate over!')

            validate(args, test_py_reader, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record)
            # For now, save model per epoch.
            if pass_id % args.save_step == 0:
                save_model(args, exe, train_prog, pass_id)


def _train_one_epoch(args, exe, compiled_train_prog, train_fetch_list,
                     train_py_reader, pass_id, trainer_id):
    """Run one training epoch and return the list of per-batch metrics."""
    train_batch_id = 0
    train_batch_metrics_record = []

    train_py_reader.start()
    try:
        while True:
            t1 = time.time()
            train_batch_metrics = exe.run(compiled_train_prog,
                                          fetch_list=train_fetch_list)
            train_batch_elapse = time.time() - t1
            train_batch_metrics_avg = np.mean(
                np.array(train_batch_metrics), axis=1)
            train_batch_metrics_record.append(train_batch_metrics_avg)
            # Only trainer 0 prints batch information.
            if trainer_id == 0:
                print_info(pass_id, train_batch_id, args.print_step,
                           train_batch_metrics_avg, train_batch_elapse, "batch")
                sys.stdout.flush()
            train_batch_id += 1
    except fluid.core.EOFException:
        # The reader is exhausted: reset it so the next epoch can start it.
        train_py_reader.reset()
    return train_batch_metrics_record
220

221

222
def main():
    """Parse the arguments, check them and start training."""
    args = parse_args()
    # Only the trainer whose PADDLE_TRAINER_ID is 0 (the default) prints
    # the arguments.
    if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
        print_arguments(args)
    check_args(args)
    train(args)
228

229 230 231

# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()