# NOTE: web-scrape residue (file-size banner, blame UI text, line-number runs)
# removed so the file parses.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import os
import time
import argparse
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import nets
import reader
from utils import ArgumentGroup

# Command-line configuration, grouped with the project's ArgumentGroup helper.
parser = argparse.ArgumentParser(__doc__)

model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints")

train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 50, "Number of epoches for training.")
train_g.add_arg("save_steps", int, 200,
                "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 200,
                "The steps interval to evaluate model performance.")
train_g.add_arg("lr", float, 0.002, "The Learning rate value for training.")
train_g.add_arg("padding_size", int, 150,
                "The padding size for input sequences.")
train_g.add_arg("model_type", str, "bow_net", "Model type of training.")

log_g = ArgumentGroup(parser, "logging", "logging related")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log")

data_g = ArgumentGroup(parser, "data",
                       "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, "./senta_data/", "Path to training data.")
data_g.add_arg("vocab_path", str, "./senta_data/word_dict.txt",
               "Vocabulary path.")
# Help text fixed: it previously read "Vocabulary path." (copy-paste error).
data_g.add_arg("vocab_size", int, 33256, "Vocabulary size.")
data_g.add_arg("batch_size", int, 256,
               "Total examples' number in batch for training.")
data_g.add_arg("random_seed", int, 0, "Random seed.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation.")
run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
run_type_g.add_arg("profile_steps", int, 60000,
                   "The steps interval to record the performance.")

# Plain argparse flag (not wrapped in an ArgumentGroup) for CE runs.
parser.add_argument("--ce", action="store_true", help="run ce")

args = parser.parse_args()

# Pick the execution place: the GPU selected via FLAGS_selected_gpus when
# CUDA is requested, otherwise a single CPU device.
if args.use_cuda:
    gpu_id = int(os.getenv('FLAGS_selected_gpus', '0'))
    place = fluid.CUDAPlace(gpu_id)
    dev_count = fluid.core.get_cuda_device_count()
else:
    place = fluid.CPUPlace()
    dev_count = 1

import paddle.fluid.profiler as profiler
import contextlib


@contextlib.contextmanager
def profile_context(profile=True):
    """Context manager that wraps its body in the paddle profiler.

    When *profile* is False it is a plain no-op, so callers can always
    use ``with profile_context(flag):`` regardless of the flag value.
    """
    if not profile:
        yield
        return
    with profiler.profiler('All', 'total', '/tmp/profile_file'):
        yield

# CE (continuous evaluation) mode: pin every random seed so repeated runs
# produce identical results.
if args.ce:
    print("ce mode")
    seed = 90
    np.random.seed(seed)
    for prog in (fluid.default_startup_program(),
                 fluid.default_main_program()):
        prog.random_seed = seed
def train():
    """Train the sentiment model selected by ``args.model_type``.

    Runs the dygraph training loop with periodic loss logging
    (``skip_steps``), validation (``validation_steps``) and
    checkpointing (``save_steps``).  Once ``steps`` exceeds
    ``args.profile_steps`` one profiled step is executed, the profile
    is dumped to /tmp/profile_file and training stops.
    """
    with fluid.dygraph.guard(place):
        if args.ce:
            # CE mode: pin all seeds inside the dygraph guard as well.
            print("ce mode")
            seed = 90
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

        processor = reader.SentaProcessor(
            data_dir=args.data_dir,
            vocab_path=args.vocab_path,
            random_seed=args.random_seed)
        # NOTE(review): num_labels / max_train_steps are not used below;
        # the processor calls are kept in case they have loading side effects.
        num_labels = len(processor.get_labels())
        num_train_examples = processor.get_num_examples(phase="train")
        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        # The original if/else branches were identical except for train-set
        # shuffling, which is disabled in CE mode for determinism.
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=not args.ce)
        eval_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='dev',
            epoch=args.epoch,
            shuffle=False)

        model_types = {
            'cnn_net': nets.CNN,
            'bow_net': nets.BOW,
            'gru_net': nets.GRU,
            'bigru_net': nets.BiGRU,
        }
        if args.model_type not in model_types:
            # Previously an unknown type fell through the if/elif chain and
            # crashed later with a NameError; fail fast with a clear message.
            raise ValueError("Unknown model_type: %s" % args.model_type)
        model = model_types[args.model_type](args.vocab_size, args.batch_size,
                                             args.padding_size)

        # NOTE(review): the variable name is historical — this is Adagrad.
        sgd_optimizer = fluid.optimizer.Adagrad(
            learning_rate=args.lr, parameter_list=model.parameters())

        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        for eop in range(args.epoch):
            time_begin = time.time()
            for batch_id, data in enumerate(train_data_generator()):
                enable_profile = steps > args.profile_steps
                with profile_context(enable_profile):
                    steps += 1
                    # Pad/truncate every document to padding_size using
                    # vocab_size as the padding id, then flatten.
                    doc = to_variable(
                        np.array([
                            np.pad(x[0][0:args.padding_size],
                                   (0, args.padding_size -
                                    len(x[0][0:args.padding_size])),
                                   'constant',
                                   constant_values=(args.vocab_size))
                            for x in data
                        ]).astype('int64').reshape(-1))
                    label = to_variable(
                        np.array([x[1] for x in data]).astype('int64').reshape(
                            args.batch_size, 1))

                    model.train()
                    avg_cost, prediction, acc = model(doc, label)
                    avg_cost.backward()
                    # Weight the running stats by the non-padding token count.
                    np_mask = (doc.numpy() != args.vocab_size).astype('int32')
                    word_num = np.sum(np_mask)
                    sgd_optimizer.minimize(avg_cost)
                    model.clear_gradients()

                    total_cost.append(avg_cost.numpy() * word_num)
                    total_acc.append(acc.numpy() * word_num)
                    total_num_seqs.append(word_num)

                    if steps % args.skip_steps == 0:
                        used_time = time.time() - time_begin
                        print("step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        total_cost, total_acc, total_num_seqs = [], [], []
                        time_begin = time.time()

                    if steps % args.validation_steps == 0:
                        total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
                        model.eval()
                        eval_steps = 0
                        # Time the validation pass on its own clock; the old
                        # code reused the training timer, so the reported eval
                        # speed also included training time since the last log.
                        eval_time_begin = time.time()
                        for eval_batch_id, eval_data in enumerate(
                                eval_data_generator()):
                            eval_np_doc = np.array([
                                np.pad(x[0][0:args.padding_size],
                                       (0, args.padding_size -
                                        len(x[0][0:args.padding_size])),
                                       'constant',
                                       constant_values=(args.vocab_size))
                                for x in eval_data
                            ]).astype('int64').reshape(-1)
                            eval_label = to_variable(
                                np.array([x[1] for x in eval_data]).astype(
                                    'int64').reshape(args.batch_size, 1))
                            eval_doc = to_variable(eval_np_doc)
                            eval_avg_cost, eval_prediction, eval_acc = model(
                                eval_doc, eval_label)
                            eval_np_mask = (
                                eval_np_doc != args.vocab_size).astype('int32')
                            eval_word_num = np.sum(eval_np_mask)
                            total_eval_cost.append(eval_avg_cost.numpy() *
                                                   eval_word_num)
                            total_eval_acc.append(eval_acc.numpy() *
                                                  eval_word_num)
                            total_eval_num_seqs.append(eval_word_num)
                            eval_steps += 1

                        used_time = time.time() - eval_time_begin
                        print(
                            "Final validation result: step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (steps, np.sum(total_eval_cost) /
                             np.sum(total_eval_num_seqs), np.sum(total_eval_acc)
                             / np.sum(total_eval_num_seqs),
                             eval_steps / used_time))
                        time_begin = time.time()

                        if args.ce:
                            print("kpis\ttrain_loss\t%0.3f" %
                                  (np.sum(total_eval_cost) /
                                   np.sum(total_eval_num_seqs)))
                            print("kpis\ttrain_acc\t%0.3f" %
                                  (np.sum(total_eval_acc) /
                                   np.sum(total_eval_num_seqs)))

                    if steps % args.save_steps == 0:
                        save_path = args.checkpoints + "/" + "save_dir_" + str(steps)
                        print('save model to: ' + save_path)
                        fluid.dygraph.save_dygraph(model.state_dict(),
                                                   save_path)

                if enable_profile:
                    # One profiled step is enough; dump and stop training.
                    print('save profile result into /tmp/profile_file')
                    return


def infer():
    """Evaluate a restored checkpoint on the inference split.

    Rebuilds the model named by ``args.model_type``, loads weights from
    ``args.checkpoints`` and reports token-weighted average accuracy
    plus throughput over the whole split.
    """
    with fluid.dygraph.guard(place):
        processor = reader.SentaProcessor(
            data_dir=args.data_dir,
            vocab_path=args.vocab_path,
            random_seed=args.random_seed)

        infer_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='infer',
            epoch=args.epoch,
            shuffle=False)

        model_types = {
            'cnn_net': nets.CNN,
            'bow_net': nets.BOW,
            'gru_net': nets.GRU,
            'bigru_net': nets.BiGRU,
        }
        if args.model_type not in model_types:
            # Previously an unknown type left model_infer unbound and
            # crashed later with a NameError; fail fast instead.
            raise ValueError("Unknown model_type: %s" % args.model_type)
        model_infer = model_types[args.model_type](
            args.vocab_size, args.batch_size, args.padding_size)

        print('Do inferring ...... ')
        restore, _ = fluid.load_dygraph(args.checkpoints)
        model_infer.set_dict(restore)
        model_infer.eval()

        total_acc, total_num_seqs = [], []
        steps = 0
        time_begin = time.time()
        for batch_id, data in enumerate(infer_data_generator()):
            steps += 1
            # Same padding scheme as training: pad/truncate to padding_size
            # with vocab_size as the padding id, then flatten.
            np_doc = np.array([
                np.pad(x[0][0:args.padding_size],
                       (0, args.padding_size - len(x[0][0:args.padding_size])),
                       'constant',
                       constant_values=(args.vocab_size))
                for x in data
            ]).astype('int64').reshape(-1)
            doc = to_variable(np_doc)
            label = to_variable(
                np.array([x[1] for x in data]).astype('int64').reshape(
                    args.batch_size, 1))
            _, _, acc = model_infer(doc, label)
            # Weight accuracy by the non-padding token count.
            mask = (np_doc != args.vocab_size).astype('int32')
            word_num = np.sum(mask)
            total_acc.append(acc.numpy() * word_num)
            total_num_seqs.append(word_num)
        used_time = time.time() - time_begin
        print("Final infer result: ave acc: %f, speed: %f steps/s" %
              (np.sum(total_acc) / np.sum(total_num_seqs), steps / used_time))


def main():
    """Dispatch to training or inference based on the run-type flags.

    Training takes precedence: when ``--do_train`` is set, inference is
    not run even if ``--do_infer`` is also set.
    """
    if args.do_train:
        train()
        return
    if args.do_infer:
        infer()


if __name__ == '__main__':
    print(args)
    main()