# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import time
import sys


def set_paddle_flags(flags):
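    """Set Paddle-related environment flags, without overriding values that
    are already present in the environment."""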
    for key, value in flags.items():
        if os.environ.get(key, None) is None:
            os.environ[key] = str(value)


# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, they will
# not take effect.
set_paddle_flags({
    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc 
    'FLAGS_fraction_of_gpu_memory_to_use': 0.98
})

import paddle
import paddle.fluid as fluid
import reader
from utils import *
import models
from build_model import create_model


def build_program(is_train, main_prog, startup_prog, args):
    """build program, and add grad op in program accroding to different mode

    Args:
        is_train: mode: train or test
        main_prog: main program
        startup_prog: strartup program
        args: arguments

    Returns : 
57 58
        train mode: [Loss, global_lr, data_loader]
        test mode: [Loss, data_loader]
R
ruri 已提交
59
    """
    if args.model.startswith('EfficientNet'):
        is_test = not is_train
        override_params = {"drop_connect_rate": args.drop_connect_rate}
        padding_type = args.padding_type
        use_se = args.use_se
        model = models.__dict__[args.model](is_test=is_test,
                                            override_params=override_params,
                                            padding_type=padding_type,
                                            use_se=use_se)
    else:
        model = models.__dict__[args.model]()
    with fluid.program_guard(main_prog, startup_prog):
        if args.random_seed:
            main_prog.random_seed = args.random_seed
            startup_prog.random_seed = args.random_seed
        with fluid.unique_name.guard():
            data_loader, loss_out = create_model(model, args, is_train)
            # add backward ops to the program in train mode
            if is_train:
                optimizer = create_optimizer(args)
                avg_cost = loss_out[0]
                optimizer.minimize(avg_cost)
                # XXX: fetch the learning rate for now; a better implementation is required here.
                global_lr = optimizer._global_learning_rate()
                global_lr.persistable = True
                loss_out.append(global_lr)
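                # optionally track an exponential moving average (EMA) of the
                # parameters; the EMA weights are only swapped in for
                # validation (see train() below)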
                if args.use_ema:
                    global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter(
                    )
                    ema = ExponentialMovingAverage(
                        args.ema_decay, thres_steps=global_steps)
                    ema.update()
                    loss_out.append(ema)
            loss_out.append(data_loader)
    return loss_out


def validate(args, test_data_loader, exe, test_prog, test_fetch_list, pass_id,
             train_batch_metrics_record):
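    """Run one full pass over the validation set, printing per-batch metrics
    and an epoch-level summary that combines the recorded training metrics
    with the validation metrics."""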
    test_batch_time_record = []
    test_batch_metrics_record = []
    test_batch_id = 0
    test_data_loader.start()
    try:
        while True:
            t1 = time.time()
            test_batch_metrics = exe.run(program=test_prog,
                                         fetch_list=test_fetch_list)
            t2 = time.time()
            test_batch_elapse = t2 - t1
            test_batch_time_record.append(test_batch_elapse)

            test_batch_metrics_avg = np.mean(
                np.array(test_batch_metrics), axis=1)
            test_batch_metrics_record.append(test_batch_metrics_avg)

            print_info(pass_id, test_batch_id, args.print_step,
                       test_batch_metrics_avg, test_batch_elapse, "batch")
            sys.stdout.flush()
            test_batch_id += 1

    except fluid.core.EOFException:
        test_data_loader.reset()
    #train_epoch_time_avg = np.mean(np.array(train_batch_time_record))
    train_epoch_metrics_avg = np.mean(
        np.array(train_batch_metrics_record), axis=0)

    test_epoch_time_avg = np.mean(np.array(test_batch_time_record))
    test_epoch_metrics_avg = np.mean(
        np.array(test_batch_metrics_record), axis=0)

    print_info(pass_id, 0, 0,
               list(train_epoch_metrics_avg) + list(test_epoch_metrics_avg),
               test_epoch_time_avg, "epoch")


def train(args):
    """Train model
    
    Args:
        args: all arguments.    
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    train_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
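    # build_program (train mode) returns [*loss_metrics, global_lr(, ema), data_loader]:
    # the data loader is always last, and the EMA object, when enabled, sits just before it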
    train_data_loader = train_out[-1]
    if args.use_ema:
        train_fetch_vars = train_out[:-2]
        ema = train_out[-2]
    else:
        train_fetch_vars = train_out[:-1]

    train_fetch_list = [var.name for var in train_fetch_vars]

    test_out = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_data_loader = test_out[-1]
    test_fetch_vars = test_out[:-1]

    test_fetch_list = [var.name for var in test_fetch_vars]

    # Clone test_prog for inference; this sets each layer's is_test attribute to True
    test_prog = test_prog.clone(for_test=True)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Initialize the model from a checkpoint or a pretrained model.
    init_model(exe, args, train_prog)
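    # with multiple trainers, construct the reader with a fixed shuffle seed
    # (updated per epoch below via set_shuffle_seed) so every trainer shuffles
    # the data in the same order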
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None)
    train_reader = imagenet_reader.train(settings=args)
    test_reader = imagenet_reader.val(settings=args)

    train_data_loader.set_sample_list_generator(train_reader, place)
    test_data_loader.set_sample_list_generator(test_reader, place)

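    # wrap train_prog with the build/execution strategies selected by
    # best_strategy_compiled (from utils) before running it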
    compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                 train_fetch_vars[0], exe)
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
    for pass_id in range(args.num_epochs):
        if num_trainers > 1:
            imagenet_reader.set_shuffle_seed(pass_id + (
                args.random_seed if args.random_seed else 0))
        train_batch_id = 0
        train_batch_time_record = []
        train_batch_metrics_record = []

        train_data_loader.start()

        try:
            while True:
                t1 = time.time()
                train_batch_metrics = exe.run(compiled_train_prog,
                                              fetch_list=train_fetch_list)
                t2 = time.time()
                train_batch_elapse = t2 - t1
                train_batch_time_record.append(train_batch_elapse)
                train_batch_metrics_avg = np.mean(
                    np.array(train_batch_metrics), axis=1)
                train_batch_metrics_record.append(train_batch_metrics_avg)
                if trainer_id == 0:
                    print_info(pass_id, train_batch_id, args.print_step,
                               train_batch_metrics_avg, train_batch_elapse,
                               "batch")
                    sys.stdout.flush()
                train_batch_id += 1

        except fluid.core.EOFException:
            train_data_loader.reset()

        if trainer_id == 0:
            if args.use_ema:
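                # ema.apply() temporarily swaps the EMA values into the
                # parameters for validation and restores the originals on exit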
                print('ExponentialMovingAverage validate start...')
                with ema.apply(exe):
                    validate(args, test_data_loader, exe, test_prog,
                             test_fetch_list, pass_id,
                             train_batch_metrics_record)
                print('ExponentialMovingAverage validate over!')

            validate(args, test_data_loader, exe, test_prog, test_fetch_list,
                     pass_id, train_batch_metrics_record)
            # save the model every args.save_step epochs
            if pass_id % args.save_step == 0:
                save_model(args, exe, train_prog, pass_id)


def main():
    args = parse_args()
    if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
        print_arguments(args)
    check_args(args)
    train(args)

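# Example launch command (a sketch only; the exact flags are defined by
# parse_args() in utils and may differ between versions of this repo):
#
#   python train.py \
#       --model=ResNet50 \
#       --batch_size=256 \
#       --class_dim=1000 \
#       --image_shape=3,224,224 \
#       --lr=0.1 \
#       --num_epochs=120 \
#       --data_dir=./data/ILSVRC2012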

if __name__ == '__main__':
    main()