#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import time
import os
import random
import math
import contextlib
from distutils.dir_util import mkpath
import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
import paddle.fluid.profiler as profiler
from paddle.fluid.executor import Executor

import reader

import sys
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")
sys.path.append('../shared_modules/')
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

from args import *
from models.model_check import check_cuda, check_version
from models.language_model import lm_model
from config import RNNConfig
import logging
import pickle

SEED = 123


@contextlib.contextmanager
def profile_context(profile=True, profiler_path='/tmp/paddingrnn.profile'):
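    # run the wrapped block under the Paddle profiler when profile is True,
    # writing the profile to profiler_path; otherwise act as a no-op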
    if profile:
        with profiler.profiler('All', 'total', profiler_path):
            yield
    else:
        yield


def get_current_model_para(train_prog, train_exe):
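    # fetch the current value of every parameter in train_prog from the global
    # scope as a numpy array, keyed by parameter name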
    param_list = train_prog.all_parameters()
    param_name_list = [p.name for p in param_list]

    vals = {}
    for p_name in param_name_list:
        p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor())
        vals[p_name] = p_array

    return vals


def save_para_npz(train_prog, train_exe):
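    # dump all parameters of train_prog into a single .npz archive for offline
    # inspection or comparison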
    print("begin to save model to model_base")
    param_list = train_prog.all_parameters()
    param_name_list = [p.name for p in param_list]

    vals = {}
    for p_name in param_name_list:
        p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor())
        vals[p_name] = p_array

    emb = vals["embedding_para"]
    print("begin to save model to model_base")
    np.savez("model_base", **vals)


def main():
    args = parse_args()

    # check that use_gpu=True is not set with a CPU-only build of PaddlePaddle
    check_cuda(args.use_gpu)
    # check that the installed PaddlePaddle version is recent enough
    check_version()

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    if not os.path.exists(args.save_model_dir):
        mkpath(args.save_model_dir)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            clip1 = fluid.clip.GradientClipByGlobalNorm(
                clip_norm=config.max_grad_norm)

            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate,
                                            grad_clip=clip1)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=False)
    # Some ops behave differently in training and inference; clone the program
    # with for_test=True so every op uses its inference behavior.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            raise ValueError("The pretrained params do not exist.")
        fluid.load(main_program, args.init_from_pretrain_model, exe)
        print("finished initializing model from pretrained params at %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True
    try:
        fluid.require_version(min_version='1.7.0')
        build_strategy.enable_auto_fusion = args.enable_auto_fusion
    except Exception:
        logger.info("PaddlePaddle version 1.7.0 or higher is "
                    "required to enable fusion_group (enable_auto_fusion).")

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished loading data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
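        # zero-valued initial hidden/cell states sized for the total batch
        # across all devices: (batch_size * device_count, num_layers, hidden_size)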
        batch_size = config.batch_size * device_count
        init_hidden = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        init_cell = np.zeros(
            (batch_size, config.num_layers, config.hidden_size),
            dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
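        # decay the base learning rate by lr_decay for every epoch beyond
        # epoch_start_decay and replicate it once per device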
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
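        # pack one batch into a feed dict: reshape x/y and optionally attach
        # the RNN initial states and the per-device learning rate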
        x, y = batch
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))

        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res

    def eval(data):
        # evaluation uses the same effective batch size as training
        eval_data_iter = reader.get_data_iter(data, config.batch_size *
                                              device_count, config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(
                batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
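        # log roughly ten times per epoch, but at least once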
        num_batches = data_len // config.batch_size
        epoch_size = (num_batches - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        # decide how often to log within this epoch
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(train_data, config.batch_size *
                                               device_count, config.num_steps)

        total_loss = 0
        iters = 0

        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=device_count)
            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])
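            # the final hidden/cell states of this batch become the initial
            # states of the next batch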
            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            # profiler tools for benchmark
            if args.profile and batch_id == log_interval:
                profiler.reset_profiler()
            elif args.profile and batch_id == (log_interval + 5):
                break
        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
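        # same loop as train_an_epoch, but x/y are supplied by the DataLoader;
        # only the learning rate and the RNN states are fed manually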
        # decide how often to log within this epoch
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
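        # each exe.run pulls the next batch from the DataLoader; the epoch ends
        # when fluid.core.EOFException is raised and the loader is reset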
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps
                if batch_id > 0 and (log_interval == 0 or
                                     batch_id % log_interval == 0):
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
                # profiler tools for benchmark
                if args.profile and batch_id == log_interval:
                    profiler.reset_profiler()
                elif args.profile and batch_id == (log_interval + 5):
                    break
        except fluid.core.EOFException:
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
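        # run config.max_epoch epochs; after each epoch (unless profiling),
        # evaluate on valid_data and save a checkpoint under save_model_dir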
        if args.use_dataloader:

            def data_gen():
                data_iter_size = config.batch_size
                train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                     config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; speed: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases. 
            # We should find a better way to calculate ppl by normalizing batch_size. 
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # with a bad initialization the ppl is still over 1000 after
                # the first epoch, so there is no point in continuing
                print(
                    "Parameters were initialized badly this time: ppl is over 1000 after the first epoch."
                )
                print("Aborting this training run; please start it again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            if not args.profile:
                # NOTE(zjl): sometimes there is not enough data for eval when batch_size is large (e.g., 2100).
                # Just skip evaluation in that case to avoid an error.
                def is_valid_data(data, batch_size, num_steps):
                    data_len = len(data)
                    batch_len = data_len // batch_size
                    epoch_size = (batch_len - 1) // num_steps
                    return epoch_size >= 1

                valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                                 config.num_steps)
                if valid_data_valid:
                    valid_ppl = eval(valid_data)
                    print("Valid ppl: %.5f" % valid_ppl[0])
                else:
                    print(
                        'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
                        format(
                            len(valid_data), config.batch_size,
                            config.num_steps))

                save_model_dir = os.path.join(args.save_model_dir,
                                              str(epoch_id))
                if not os.path.exists(save_model_dir):
                    mkpath(save_model_dir)
                save_model_dir = os.path.join(save_model_dir, 'params')

                fluid.save(program=main_program, model_path=save_model_dir)
                print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile, args.profiler_path):
        train()

    test_ppl = eval(test_data)
    print("Test ppl:", test_ppl[0])


if __name__ == '__main__':
    main()