# main.py
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import os
import time
import argparse
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
import nets
import reader
from utils import ArgumentGroup

# Command-line interface.  Options are registered through the project's
# ArgumentGroup helper, one group per concern; the resulting namespace is
# stored in the module-level ``args`` that the rest of this file reads.
# NOTE(review): the first positional parameter of ArgumentParser is ``prog``,
# so ``__doc__`` is used as the program name here, not as the description.
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints")

train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 10, "Number of epoches for training.")
train_g.add_arg("save_steps", int, 1000,
                "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 200,
                "The steps interval to evaluate model performance.")
train_g.add_arg("lr", float, 0.002, "The Learning rate value for training.")
train_g.add_arg("padding_size", int, 150,
                "The padding size for input sequences.")

log_g = ArgumentGroup(parser, "logging", "logging related")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log")

data_g = ArgumentGroup(parser, "data",
                       "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, "./senta_data/", "Path to training data.")
data_g.add_arg("vocab_path", str, "./senta_data/word_dict.txt",
               "Vocabulary path.")
# The help text previously repeated "Vocabulary path."; this option is the
# vocabulary size (it is also used as the padding id further down).
data_g.add_arg("vocab_size", int, 33256, "Vocabulary size.")
data_g.add_arg("batch_size", int, 16,
               "Total examples' number in batch for training.")
data_g.add_arg("random_seed", int, 0, "Random seed.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation.")
run_type_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
run_type_g.add_arg("profile_steps", int, 15000,
                   "The steps interval to record the performance.")
train_g.add_arg("model_type", str, "bow_net", "Model type of training.")
# ``--ce`` is a plain flag (no value); when set, seeds are fixed and the
# training data is not shuffled.
parser.add_argument("--ce", action="store_true", help="run ce")

args = parser.parse_args()

# Choose the execution device.  On CPU a single device is assumed; on GPU the
# card index comes from the FLAGS_selected_gpus environment variable
# (defaulting to card 0) and the device count is queried from fluid.
if not args.use_cuda:
    place, dev_count = fluid.CPUPlace(), 1
else:
    place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', '0')))
    dev_count = fluid.core.get_cuda_device_count()

import paddle.fluid.profiler as profiler
import contextlib


@contextlib.contextmanager
def profile_context(profile=True):
    """Context manager that optionally runs its body under the fluid profiler.

    Args:
        profile: when truthy, the body executes inside
            ``profiler.profiler('All', 'total', '/tmp/profile_file')``;
            when falsy, the body executes with no wrapping at all.
    """
    if not profile:
        # Profiling disabled: behave as a no-op context manager.
        yield
        return

    with profiler.profiler('All', 'total', '/tmp/profile_file'):
        yield


# When the --ce flag is given, pin the NumPy seed and the random seeds of
# fluid's default startup and main programs so that runs are repeatable.
if args.ce:
    print("ce mode")
    seed = 90
    np.random.seed(seed)
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed

def _pad_batch(batch, padding_size, pad_id):
    """Truncate or right-pad the token ids of every sample to a fixed length.

    Args:
        batch: iterable of samples; ``sample[0]`` is the token-id sequence.
        padding_size: number of ids kept/produced per sample.
        pad_id: value written into the padded (trailing) positions.

    Returns:
        An int64 ndarray holding one row of ``padding_size`` ids per sample.
    """
    rows = []
    for sample in batch:
        tokens = sample[0][0:padding_size]
        rows.append(
            np.pad(tokens, (0, padding_size - len(tokens)),
                   'constant',
                   constant_values=(pad_id)))
    return np.array(rows).astype('int64')


def train():
    """Train the selected model in dygraph mode, with periodic eval and saves.

    Reads every setting from the module-level ``args``.  For each training
    batch the inputs are padded to ``args.padding_size`` (using
    ``args.vocab_size`` as the padding id), one Adagrad step is taken, and
    loss/accuracy are accumulated weighted by the number of non-padding
    tokens.  Every ``skip_steps`` steps the running averages are printed,
    every ``validation_steps`` steps the whole 'dev' split is evaluated, and
    every ``save_steps`` steps the model state dict is saved to
    ``save_dir_<steps>``.  Once ``steps`` exceeds ``args.profile_steps`` a
    single step is run under the profiler and the function returns.

    Raises:
        ValueError: if ``args.model_type`` is neither 'cnn_net' nor 'bow_net'.
    """
    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 90
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
        processor = reader.SentaProcessor(
            data_dir=args.data_dir,
            vocab_path=args.vocab_path,
            random_seed=args.random_seed)
        # NOTE(review): the three values below are not used further down; the
        # calls are kept in case the reader relies on them being made.
        num_labels = len(processor.get_labels())

        num_train_examples = processor.get_num_examples(phase="train")

        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        # Training data is shuffled unless --ce was given; the dev split is
        # never shuffled.
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=not args.ce)

        eval_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='dev',
            epoch=args.epoch,
            shuffle=False)

        if args.model_type == 'cnn_net':
            model = nets.CNN("cnn_net", args.vocab_size, args.batch_size,
                             args.padding_size)
        elif args.model_type == 'bow_net':
            model = nets.BOW("bow_net", args.vocab_size, args.batch_size,
                             args.padding_size)
        else:
            # Fail here with a clear message instead of hitting an unbound
            # ``model`` on the first training step.
            raise ValueError(
                "Unsupported model_type %r; expected 'cnn_net' or 'bow_net'."
                % (args.model_type, ))

        # The variable name is historical: the optimizer is Adagrad.
        sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []

        # NOTE(review): the generator is also built with epoch=args.epoch;
        # whether it already repeats the data internally is not visible here.
        for _ in range(args.epoch):
            time_begin = time.time()
            for data in train_data_generator():
                enable_profile = steps > args.profile_steps

                with profile_context(enable_profile):

                    steps += 1
                    doc = to_variable(
                        _pad_batch(data, args.padding_size,
                                   args.vocab_size).reshape(-1, 1))

                    label = to_variable(
                        np.array([x[1] for x in data]).astype('int64').reshape(
                            args.batch_size, 1))

                    # Validation below switches to eval mode, so training
                    # mode is re-enabled on every step.
                    model.train()
                    avg_cost, prediction, acc = model(doc, label)
                    avg_cost.backward()
                    # Count the non-padding tokens; used as the batch weight.
                    np_mask = (doc.numpy() != args.vocab_size).astype('int32')
                    word_num = np.sum(np_mask)
                    sgd_optimizer.minimize(avg_cost)
                    model.clear_gradients()
                    total_cost.append(avg_cost.numpy() * word_num)
                    total_acc.append(acc.numpy() * word_num)
                    total_num_seqs.append(word_num)

                    if steps % args.skip_steps == 0:
                        time_end = time.time()
                        used_time = time_end - time_begin
                        print("step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        total_cost, total_acc, total_num_seqs = [], [], []
                        time_begin = time.time()

                    if steps % args.validation_steps == 0:
                        total_eval_cost, total_eval_acc, total_eval_num_seqs = [], [], []
                        model.eval()
                        eval_steps = 0
                        for eval_data in eval_data_generator():
                            eval_np_doc = _pad_batch(
                                eval_data, args.padding_size,
                                args.vocab_size).reshape(1, -1)
                            eval_label = to_variable(
                                np.array([x[1] for x in eval_data]).astype(
                                    'int64').reshape(args.batch_size, 1))
                            eval_doc = to_variable(eval_np_doc.reshape(-1, 1))
                            eval_avg_cost, eval_prediction, eval_acc = model(
                                eval_doc, eval_label)

                            eval_np_mask = (
                                eval_np_doc != args.vocab_size).astype('int32')
                            eval_word_num = np.sum(eval_np_mask)
                            total_eval_cost.append(eval_avg_cost.numpy() *
                                                   eval_word_num)
                            total_eval_acc.append(eval_acc.numpy() *
                                                  eval_word_num)
                            total_eval_num_seqs.append(eval_word_num)

                            eval_steps += 1

                        time_end = time.time()
                        used_time = time_end - time_begin
                        eval_loss = (np.sum(total_eval_cost) /
                                     np.sum(total_eval_num_seqs))
                        eval_accuracy = (np.sum(total_eval_acc) /
                                         np.sum(total_eval_num_seqs))
                        print(
                            "Final validation result: step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (steps, eval_loss, eval_accuracy,
                             eval_steps / used_time))
                        time_begin = time.time()
                        if args.ce:
                            # NOTE(review): the kpi labels say "train" but the
                            # values are the dev-split metrics computed above;
                            # labels left untouched for whatever consumes them.
                            print("kpis\ttrain_loss\t%0.3f" % eval_loss)
                            print("kpis\ttrain_acc\t%0.3f" % eval_accuracy)

                    if steps % args.save_steps == 0:
                        save_path = "save_dir_" + str(steps)
                        print('save model to: ' + save_path)
                        fluid.dygraph.save_dygraph(model.state_dict(),
                                                   save_path)
                # The step that ran under the profiler is the last one.
                if enable_profile:
                    print('save profile result into /tmp/profile_file')
                    return


def infer():
    """Load a saved checkpoint and report accuracy on the 'infer' split.

    Reads every setting from the module-level ``args``.  The model chosen by
    ``args.model_type`` is built, its parameters are restored from
    ``args.checkpoints`` and it is switched to eval mode.  Each batch is
    padded to ``args.padding_size`` with ``args.vocab_size`` as the padding
    id; per-batch accuracy is weighted by the number of non-padding tokens
    and the weighted average plus throughput are printed at the end.

    Raises:
        ValueError: if ``args.model_type`` is neither 'cnn_net' nor 'bow_net'.
    """
    with fluid.dygraph.guard(place):
        processor = reader.SentaProcessor(
            data_dir=args.data_dir,
            vocab_path=args.vocab_path,
            random_seed=args.random_seed)

        infer_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='infer',
            epoch=args.epoch,
            shuffle=False)
        if args.model_type == 'cnn_net':
            model_infer = nets.CNN("cnn_net", args.vocab_size, args.batch_size,
                                   args.padding_size)
        elif args.model_type == 'bow_net':
            model_infer = nets.BOW("bow_net", args.vocab_size, args.batch_size,
                                   args.padding_size)
        else:
            raise ValueError(
                "Unsupported model_type %r; expected 'cnn_net' or 'bow_net'."
                % (args.model_type, ))
        print('Do inferring ...... ')
        total_acc, total_num_seqs = [], []

        # Restore the parameters into the model built above.  (This used to
        # reference an undefined name, ``cnn_net_infer``.)
        restore, _ = fluid.load_dygraph(args.checkpoints)
        model_infer.set_dict(restore)
        model_infer.eval()

        steps = 0
        time_begin = time.time()
        for batch_id, data in enumerate(infer_data_generator()):
            steps += 1
            # Truncate/right-pad every sample to padding_size ids, then
            # flatten to a single column.
            np_doc = np.array([
                np.pad(x[0][0:args.padding_size],
                       (0, args.padding_size - len(x[0][0:args.padding_size])),
                       'constant',
                       constant_values=(args.vocab_size)) for x in data
            ]).astype('int64').reshape(-1, 1)
            doc = to_variable(np_doc)
            label = to_variable(
                np.array([x[1] for x in data]).astype('int64').reshape(
                    args.batch_size, 1))

            _, _, acc = model_infer(doc, label)

            # Weight the batch accuracy by its count of non-padding tokens.
            mask = (np_doc != args.vocab_size).astype('int32')
            word_num = np.sum(mask)
            total_acc.append(acc.numpy() * word_num)
            total_num_seqs.append(word_num)

        time_end = time.time()
        used_time = time_end - time_begin

        print("Final infer result: ave acc: %f, speed: %f steps/s" %
              (np.sum(total_acc) / np.sum(total_num_seqs), steps / used_time))


def main():
    """Dispatch to training or inference according to the parsed flags.

    Training takes precedence: ``infer()`` only runs when ``args.do_train``
    is falsy and ``args.do_infer`` is truthy.  If neither flag is set,
    nothing happens.
    """
    if args.do_train:
        train()
        return
    if args.do_infer:
        infer()


# Script entry point: echo the parsed options, then run the selected mode.
if __name__ == "__main__":
    print(args)
    main()