# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on dialogue tasks."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import time
import numpy as np

import paddle.fluid as fluid

from dgu_net import create_net
import dgu.reader as reader
from dgu.optimization import optimization
import dgu.define_paradigm as define_paradigm
from dgu.utils.configure import PDConfig
from dgu.utils.input_field import InputField
from dgu.utils.model_check import check_cuda


def do_train(args):
    """Finetune a dialogue-understanding model on the task named in args.

    Builds the static train program (input placeholders, network, optimizer),
    initializes parameters from a checkpoint or a pretrained model, then runs
    the epoch/step training loop, periodically logging metrics and saving
    checkpoints.

    Args:
        args: parsed configuration (PDConfig) holding task, data, model and
            optimization settings.

    Raises:
        ValueError: if neither ``init_from_params`` nor
            ``init_from_pretrain_model`` is set.
    """

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)

    # One data processor per supported dialogue task.
    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
    }

    train_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()

    with fluid.program_guard(train_prog, startup_prog):
        train_prog.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed
        with fluid.unique_name.guard():
            num_labels = len(processors[task_name].get_labels())

            # Input placeholders; -1 marks the variable batch dimension.
            src_ids = fluid.data(
                name='src_ids', shape=[-1, args.max_seq_len], dtype='int64')
            pos_ids = fluid.data(
                name='pos_ids', shape=[-1, args.max_seq_len], dtype='int64')
            sent_ids = fluid.data(
                name='sent_ids', shape=[-1, args.max_seq_len], dtype='int64')
            input_mask = fluid.data(
                name='input_mask',
                shape=[-1, args.max_seq_len, 1],
                dtype='float32')
            # Label layout differs per task: per-token labels for slot
            # filling, a multi-label vector for dstc2, one label otherwise.
            if args.task_name == 'atis_slot':
                labels = fluid.data(
                    name='labels', shape=[-1, args.max_seq_len], dtype='int64')
            elif args.task_name in ['dstc2']:
                labels = fluid.data(
                    name='labels', shape=[-1, num_labels], dtype='int64')
            else:
                labels = fluid.data(name='labels', shape=[-1, 1], dtype='int64')

            input_inst = [src_ids, pos_ids, sent_ids, input_mask, labels]
            input_field = InputField(input_inst)

            data_reader = fluid.io.DataLoader.from_generator(
                feed_list=input_inst, capacity=4, iterable=False)

            processor = processors[task_name](data_dir=args.data_dir,
                                              vocab_path=args.vocab_path,
                                              max_seq_len=args.max_seq_len,
                                              do_lower_case=args.do_lower_case,
                                              in_tokens=args.in_tokens,
                                              task_name=task_name,
                                              random_seed=args.random_seed)

            results = create_net(
                is_training=True,
                model_input=input_field,
                num_labels=num_labels,
                paradigm_inst=paradigm_inst,
                args=args)

            # Not every paradigm produces every output; missing ones are None.
            loss = results.get("loss", None)
            accuracy = results.get("accuracy", None)
            num_seqs = results.get("num_seqs", None)

            places = fluid.cuda_places() if args.use_cuda else fluid.cpu_places()
            dev_count = len(places)

            batch_generator = processor.data_generator(
                batch_size=args.batch_size, phase='train', shuffle=True)
            num_train_examples = processor.get_num_examples(phase='train')

            # In token mode, batch_size counts tokens, so convert it to an
            # approximate number of examples per batch first.
            if args.in_tokens:
                max_train_steps = args.epoch * num_train_examples // (
                    args.batch_size // args.max_seq_len) // dev_count
            else:
                max_train_steps = (args.epoch * num_train_examples //
                                   args.batch_size // dev_count)

            warmup_steps = int(max_train_steps * args.warmup_proportion)
            print("Num train examples: %d" % num_train_examples)
            print("Max train steps: %d" % max_train_steps)
            print("Num warmup steps: %d" % warmup_steps)

            # optimization() wires the optimizer into train_prog and returns
            # a variable (presumably the scheduled learning rate -- its name
            # is fetched for logging below; verify against dgu.optimization).
            optimizor = optimization(
                loss=loss,
                warmup_steps=warmup_steps,
                num_train_steps=max_train_steps,
                learning_rate=args.learning_rate,
                train_program=train_prog,
                startup_prog=startup_prog,
                weight_decay=args.weight_decay,
                scheduler=args.lr_scheduler,
                use_fp16=False,
                loss_scaling=args.loss_scaling)

    data_reader.set_batch_generator(batch_generator, places=places)

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # Explicit check instead of `assert`: asserts vanish under `python -O`.
    if not (args.init_from_params or args.init_from_pretrain_model):
        raise ValueError(
            "either init_from_params or init_from_pretrain_model must be set")

    # init from some checkpoint, to resume the previous training
    if args.init_from_params:
        fluid.load(train_prog, args.init_from_params, exe)
    if args.init_from_pretrain_model:
        fluid.load(train_prog, args.init_from_pretrain_model, exe)

    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.enable_inplace = True

    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    # start training
    steps = 0
    time_begin = time.time()
    ce_info = []  # rows of [loss, (acc,) speed] for the CE report below
    for epoch_step in range(args.epoch):
        data_reader.start()
        while True:
            try:
                steps += 1
                # Fetch metrics only on logging steps; an empty fetch list
                # keeps other iterations cheap.  The optimization() output is
                # only worth fetching when a warmup schedule is active.
                if steps % args.print_steps == 0:
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            fetch_list = [
                                loss.name, accuracy.name, num_seqs.name
                            ]
                        else:
                            fetch_list = [loss.name, num_seqs.name]
                    else:
                        if accuracy is not None:
                            fetch_list = [
                                loss.name, accuracy.name, optimizor.name,
                                num_seqs.name
                            ]
                        else:
                            fetch_list = [
                                loss.name, optimizor.name, num_seqs.name
                            ]
                else:
                    fetch_list = []

                outputs = exe.run(compiled_train_prog, fetch_list=fetch_list)

                if steps % args.print_steps == 0:
                    # Unpack in the same order the fetch list was built.
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            np_loss, np_acc, np_num_seqs = outputs
                        else:
                            np_loss, np_num_seqs = outputs
                    else:
                        if accuracy is not None:
                            np_loss, np_acc, np_lr, np_num_seqs = outputs
                        else:
                            np_loss, np_lr, np_num_seqs = outputs

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                    if accuracy is not None:
                        print("%s epoch: %d, step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (current_time, epoch_step, steps,
                               np.mean(np_loss), np.mean(np_acc),
                               args.print_steps / used_time))
                        ce_info.append([
                            np.mean(np_loss), np.mean(np_acc),
                            args.print_steps / used_time
                        ])
                    else:
                        print("%s epoch: %d, step: %d, ave loss: %f, "
                              "speed: %f steps/s" %
                              (current_time, epoch_step, steps,
                               np.mean(np_loss), args.print_steps / used_time))
                        ce_info.append(
                            [np.mean(np_loss), args.print_steps / used_time])
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    model_path = os.path.join(args.save_model_path,
                                              "step_" + str(steps))
                    fluid.save(train_prog, model_path)

            except fluid.core.EOFException:
                # Generator exhausted: reset the loader, go to the next epoch.
                data_reader.reset()
                break

    model_path = os.path.join(args.save_model_path, "step_final")
    fluid.save(train_prog, model_path)

    def get_cards():
        """Return the number of visible CUDA devices (0 if none listed)."""
        num = 0
        cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
        print("test_cards", cards)
        if cards != '':
            num = len(cards.split(","))
        return num

    if args.enable_ce:
        card_num = get_cards()
        print("test_card_num", card_num)
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            # Second-to-last logged row; the last one may span an incomplete
            # interval.
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except IndexError:
            # Fewer than two logged rows, or rows without an accuracy column
            # (tasks whose paradigm reports no accuracy log 2-element rows).
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" % (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))


if __name__ == '__main__':

    # Load finetuning settings from the yaml config file and echo them.
    cfg = PDConfig(yaml_file="./data/config/dgu.yaml")
    cfg.build()
    cfg.Print()

    # Fail fast when CUDA is requested but unavailable.
    check_cuda(cfg.use_cuda)

    do_train(cfg)