#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import argparse
import numpy as np
import multiprocessing
import sys
# sys.path.append("../models/classification/")
from nets import textcnn_net_multi_label
import paddle
import paddle.fluid as fluid
from utils import ArgumentGroup, print_arguments, DataProcesser, DataReader, ConfigReader
from utils import init_checkpoint, check_version, logger
import random
import codecs
import logging
import math
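
# Fix the global numpy/python RNG seeds up front for reproducibility; a second
# seeding from args.random_seed happens later in main() and build_graph().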
np.random.seed(0)
random.seed(0)

parser = argparse.ArgumentParser(__doc__)
DEV_COUNT = 1
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("init_checkpoint", str, None,
                "Init checkpoint to resume training from.")
model_g.add_arg("checkpoints", str, "./checkpoints",
                "Path to save checkpoints.")
model_g.add_arg("config_path", str, "./data/input/model.conf", "Model conf.")
model_g.add_arg("build_dict", bool, False, "Build dict.")

train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("cpu_num", int, 3, "Number of Threads.")
train_g.add_arg("epoch", int, 100, "Number of epoches for training.")
train_g.add_arg("learning_rate", float, 0.1,
                "Learning rate used to train with warmup.")
train_g.add_arg("save_steps", int, 1000,
                "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 100,
                "The steps interval to evaluate model performance.")
train_g.add_arg("random_seed", int, 7, "random seed")
train_g.add_arg(
    "threshold", float, 0.1,
    "When the confidence exceeds the threshold, the corresponding label is given."
)
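# For example, with the default threshold of 0.1, every label whose predicted
# score exceeds 0.1 is emitted for a query (see the loop in evaluate()).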

log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")

data_g = ArgumentGroup(parser, "data",
                       "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, "./data/input/", "Path to training data.")
data_g.add_arg("save_dir", str, "./data/output/", "Path to save.")
data_g.add_arg("max_seq_len", int, 50,
               "Tokens' number of the longest seqence allowed.")
data_g.add_arg("batch_size", int, 64,
               "The total number of examples in one batch for training.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
# run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("do_train", bool, True,
                   "Whether to perform evaluation on test data set.")
run_type_g.add_arg("do_eval", bool, True,
                   "Whether to perform evaluation on test data set.")
run_type_g.add_arg("do_test", bool, True,
                   "Whether to perform evaluation on test data set.")
args = parser.parse_args()
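
# Illustrative invocation (a sketch; flag names come from the arguments above,
# exact boolean parsing depends on utils.ArgumentGroup):
#   python run_classifier.py --data_dir ./data/input/ --epoch 100 \
#       --batch_size 64 --use_cuda false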


def get_score(pred_result, label, eval_phase):
    """[get precision recall and f-score]
    
    Arguments:
        pred_result {[type]} -- [pred labels]
        label {[type]} -- [origin labels]
    """
    tp = 0
    total = 0
    true_cnt = 0
    pred_pos_num = 0
    pos_num = 0
    for i in range(len(pred_result)):
        total += 1
        pred_labels = []
        actual_labels = []
        for j in range(1, len(pred_result[0])):  # the 0 one is background
            if pred_result[i][j] == 1:
                pred_labels.append(j)
            if label[i][j] == 1:
                actual_labels.append(j)
        if len(pred_labels) > 0:
            pred_pos_num += 1
        if len(actual_labels) > 0:
            pos_num += 1
            if set(actual_labels).issubset(set(pred_labels)):
                tp += 1
                true_cnt += 1
        elif len(pred_labels) == 0 and len(actual_labels) == 0:
            true_cnt += 1
    try:
        precision = tp * 1.0 / pred_pos_num
        recall = tp * 1.0 / pos_num
        f1 = 2 * precision * recall / (recall + precision)
    except ZeroDivisionError:
        # no positive prediction or no positive example: report all zeros
        precision = 0
        recall = 0
        f1 = 0
    acc = true_cnt * 1.0 / total
    logger.info("tp, pred_pos_num, pos_num, total")
    logger.info("%d, %d, %d, %d" % (tp, pred_pos_num, pos_num, total))
    logger.info("%s result is : precision is %f, recall is %f, f1_score is %f, acc is %f" % (eval_phase, precision, \
                recall, f1, acc))
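
# Worked example (illustrative): with
#     pred_result = [[0, 1, 0], [0, 0, 1]] and label = [[0, 1, 0], [0, 1, 0]],
# example 0 is an exact hit (tp = 1) while example 1 predicts label 2 against
# gold label 1, so pred_pos_num = pos_num = 2 and
# precision = recall = f1 = acc = 0.5.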


def train(args, train_exe, build_res, place):
    """[train the net]
    
    Arguments:
        args {[type]} -- [description]
        train_exe {[type]} -- [description]
        compiled_prog{[type]} -- [description]
        build_res {[type]} -- [description]
        place {[type]} -- [description]
    """
    global DEV_COUNT
    compiled_prog = build_res["compiled_prog"]
    cost = build_res["cost"]
    prediction = build_res["prediction"]
    pred_label = build_res["pred_label"]
    label = build_res["label"]
    fetch_list = [cost.name, prediction.name, pred_label.name, label.name]
    train_data_loader = build_res["train_data_loader"]
    train_prog = build_res["train_prog"]
    steps = 0
    time_begin = time.time()
    test_exe = train_exe
    logger.info("Begin training")
    for i in range(args.epoch):
        try:
            for data in train_data_loader():
                avg_cost_np, avg_pred_np, pred_label, label = train_exe.run(
                    feed=data, program=compiled_prog, fetch_list=fetch_list)
                steps += 1
                if steps % int(args.skip_steps) == 0:
                    time_end = time.time()
                    used_time = time_end - time_begin
                    get_score(pred_label, label, eval_phase="Train")
                    logger.info('loss is {}'.format(avg_cost_np))
                    logger.info("epoch: %d, step: %d, speed: %f steps/s" %
                                (i, steps, args.skip_steps / used_time))
                    time_begin = time.time()
                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save(train_prog, save_path)
                    logger.info("[save]step %d : save at %s" %
                                (steps, save_path))
                if steps % args.validation_steps == 0:
                    if args.do_eval:
                        evaluate(args, test_exe, build_res, "eval")
                    if args.do_test:
                        evaluate(args, test_exe, build_res, "test")
        except Exception as e:
            logger.exception("Train error : %s" % str(e))
            sys.exit(1)
    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
    fluid.io.save(train_prog, save_path)
    logger.info("[save]step %d : save at %s" % (steps, save_path))


def evaluate(args,
             test_exe,
             build_res,
             eval_phase,
             save_result=False,
             id2intent=None):
    """[evaluate on dev/test dataset]
    
    Arguments:
        args {[type]} -- [description]
        test_exe {[type]} -- [description]
        test_prog {[type]} -- [description]
        build_res {[type]} -- [description]
        place {[type]} -- [description]
        eval_phase {[type]} -- [description]
    
    Keyword Arguments:
        threshold {float} -- [description] (default: {0.5})
        save_result {bool} -- [description] (default: {False})
        id2intent {[type]} -- [description] (default: {None})
    """
    place = build_res["test_place"]
    threshold = args.threshold
    cost = build_res["cost"]
    prediction = build_res["prediction"]
    pred_label = build_res["pred_label"]
    label = build_res["label"]
    fetch_list = [cost.name, prediction.name, pred_label.name, label.name]
    total_cost, total_acc, pred_prob_list, pred_label_list, label_list = [], [], [], [], []
    if eval_phase == "eval":
        test_prog = build_res["eval_compiled_prog"]
        test_data_loader = build_res["eval_data_loader"]
    elif eval_phase == "test":
        test_prog = build_res["test_compiled_prog"]
        test_data_loader = build_res["test_data_loader"]
    else:
        logger.error("Unknown eval_phase: %s" % eval_phase)
        sys.exit(1)
    logger.info("-----------------------------------------------------------")
    for data in test_data_loader():
        avg_cost_np, avg_pred_np, pred_label, label = test_exe.run(
            program=test_prog, fetch_list=fetch_list, feed=data,
            return_numpy=True)
        total_cost.append(avg_cost_np)
        pred_prob_list.extend(avg_pred_np)
        pred_label_list.extend(pred_label)
        label_list.extend(label)

    if save_result:
        logger.info("save result at : %s" % args.save_dir + "/" + eval_phase +
                    ".rst")
        save_dir = args.save_dir
        if not os.path.exists(save_dir):
            logger.warning("save dir not exists, and create it")
            os.makedirs(save_dir)
        fin = codecs.open(
            os.path.join(args.data_dir, eval_phase + ".txt"),
            "r",
            encoding="utf8")
        fout = codecs.open(
            os.path.join(args.save_dir, eval_phase + ".rst"),
            "w",
            encoding="utf8")
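        # Output format below: "<query>\t<label_1>\x02<label_2>..."; when no
        # score clears the threshold, the background label id2intent[0] is
        # written instead.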
        for line in pred_prob_list:
            query = fin.readline().rsplit("\t", 1)[0]
            res = []
            for i in range(1, len(line)):
                if line[i] > threshold:
                    #res.append(id2intent[i]+":"+str(line[i]))
                    res.append(id2intent[i])
            if len(res) == 0:
                res.append(id2intent[0])
            fout.write("%s\t%s\n" % (query, "\2".join(sorted(res))))
        fout.close()
        fin.close()

    logger.info("[%s] result: " % eval_phase)
    get_score(pred_label_list, label_list, eval_phase)
    logger.info('loss is {}'.format(sum(total_cost) * 1.0 / len(total_cost)))
    logger.info("-----------------------------------------------------------")


def create_net(args,
               flow_data,
               class_dim,
               dict_dim,
               place,
               model_name="textcnn_net",
               is_infer=False):
    """[create network and loader]
279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
    
    Arguments:
        flow_data {[type]} -- [description]
        class_dim {[type]} -- [description]
        dict_dim {[type]} -- [description]
        place {[type]} -- [description]
    
    Keyword Arguments:
        model_name {str} -- [description] (default: {"textcnn_net"})
        is_infer {bool} -- [description] (default: {False})
    
    Returns:
        [type] -- [description]
    """
    if model_name == "textcnn_net":
        model = textcnn_net_multi_label
    else:
        logger.error("Unknown model name: %s" % model_name)
        return
    char_list = fluid.data(
        name="char",
        shape=[None, args.max_seq_len, 1],
        dtype="int64",
        lod_level=0)
    label = fluid.data(
        name="label", shape=[None, class_dim], dtype="float32",
        lod_level=0)  # label data
    data_loader = fluid.io.DataLoader.from_generator(
        feed_list=[char_list, label],
        capacity=args.batch_size * 10,
        iterable=True,
        return_list=False)
    output = model(
        char_list,
        label,
        dict_dim,
        emb_dim=flow_data["model"]["emb_dim"],
        hid_dim=flow_data["model"]["hid_dim"],
        hid_dim2=flow_data["model"]["hid_dim2"],
        class_dim=class_dim,
        win_sizes=flow_data["model"]["win_sizes"],
        is_infer=is_infer,
        threshold=args.threshold,
        max_seq_len=args.max_seq_len)
    if is_infer:
        prediction = output
        return [data_loader, prediction]
    else:
        avg_cost, prediction, pred_label, label = (output[0], output[1],
                                                   output[2], output[3])
        return [data_loader, avg_cost, prediction, pred_label, label]
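
# Note on feed shapes: the loader above expects, per sample, a padded
# [max_seq_len, 1] int64 char-id tensor plus a float32 multi-hot label vector
# of length class_dim, matching the fluid.data declarations in create_net.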


def build_data_loader(args, char_dict, intent_dict):
    """[decorate samples for dataloader]
333 334 335 336 337 338 339 340 341
    
    Arguments:
        args {[type]} -- [description]
        char_dict {[type]} -- [description]
        intent_dict {[type]} -- [description]
    
    Returns:
        [type] -- [description]
    """
    loader_res = {}
    if args.do_train:
        train_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        train_data_generator = train_processor.prepare_data(
            data_path=args.data_dir + "train.txt",
            batch_size=args.batch_size,
            mode='train')
        loader_res["train_data_generator"] = train_data_generator
        num_train_examples = train_processor._get_num_examples()
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Num train steps: %d" % (math.ceil(num_train_examples * 1.0 / args.batch_size) * \
                                            args.epoch // DEV_COUNT))
        if math.ceil(num_train_examples * 1.0 /
                     args.batch_size) // DEV_COUNT <= 0:
            logger.error("Num of train steps is less than or equal to 0, exit")
            exit(1)
    if args.do_eval:
        eval_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        eval_data_generator = eval_processor.prepare_data(
            data_path=args.data_dir + "eval.txt",
            batch_size=args.batch_size,
            mode='eval')
        loader_res["eval_data_generator"] = eval_data_generator
        num_eval_examples = eval_processor._get_num_examples()
        logger.info("Num eval examples: %d" % num_eval_examples)
    if args.do_test:
        test_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        test_data_generator = test_processor.prepare_data(
            data_path=args.data_dir + "test.txt",
            batch_size=args.batch_size,
            mode='test')
        loader_res["test_data_generator"] = test_data_generator
    return loader_res
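
# A sketch of the assumed on-disk format (inferred from evaluate(), which
# recovers the query via line.rsplit("\t", 1)[0]): each line of train.txt /
# eval.txt / test.txt is "<query>\t<labels>".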


def build_graph(args, model_config, num_labels, dict_dim, place, test_place,
                loader_res):
    """Build the paddle train/eval/test programs.

    Arguments:
        args -- parsed command-line arguments
        model_config -- model configuration read from config_path
        num_labels -- number of labels
        dict_dim -- vocabulary size
        place -- training devices
        test_place -- evaluation/test devices
        loader_res -- data generators from build_data_loader

    Returns:
        dict of programs, data loaders and fetch targets
    """
    res = {}
    cost, prediction, pred_label, label = None, None, None, None
    train_prog = fluid.default_main_program()
    startup_prog = fluid.default_startup_program()
    eval_prog = train_prog.clone(for_test=True)
    test_prog = train_prog.clone(for_test=True)
    train_prog.random_seed = args.random_seed
    startup_prog.random_seed = args.random_seed
    if args.do_train:
        with fluid.program_guard(train_prog, startup_prog):
            with fluid.unique_name.guard():
                train_data_loader, cost, prediction, pred_label, label = create_net(
                    args, model_config, num_labels, dict_dim, place,
                    model_name="textcnn_net")
                train_data_loader.set_sample_list_generator(
                    loader_res['train_data_generator'], places=place)
                res["train_data_loader"] = train_data_loader
                sgd_optimizer = fluid.optimizer.SGD(
                    learning_rate=fluid.layers.exponential_decay(
                        learning_rate=args.learning_rate,
                        decay_steps=1000,
                        decay_rate=0.5,
                        staircase=True))
                sgd_optimizer.minimize(cost)
    if args.do_eval:
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                eval_data_loader, cost, prediction, pred_label, label = create_net(
                    args, model_config, num_labels, dict_dim, test_place,
                    model_name="textcnn_net")
                eval_data_loader.set_sample_list_generator(
                    loader_res['eval_data_generator'], places=test_place)
                res["eval_data_loader"] = eval_data_loader
    if args.do_test:
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_data_loader, cost, prediction, pred_label, label = create_net(
                    args, model_config, num_labels, dict_dim, test_place,
                    model_name="textcnn_net")
                test_data_loader.set_sample_list_generator(
                    loader_res['test_data_generator'], places=test_place)
                res["test_data_loader"] = test_data_loader
    res["cost"] = cost
    res["prediction"] = prediction
    res["label"] = label
    res["pred_label"] = pred_label
    res["train_prog"] = train_prog
    res["eval_prog"] = eval_prog
    res["test_prog"] = test_prog

    return res
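
# Parameter sharing across the three programs relies on fluid.unique_name.guard()
# giving identical variable names to each create_net call, so the train/eval/test
# graphs resolve to the same parameters in the executor's scope.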


def main(args):
    """
    Main Function
    """
    global DEV_COUNT
    startup_prog = fluid.default_startup_program()
    random.seed(args.random_seed)
    model_config = ConfigReader.read_conf(args.config_path)
    if args.use_cuda:
        test_place = fluid.cuda_places(0)
        place = fluid.cuda_places()
        DEV_COUNT = len(place)
    else:
        test_place = fluid.cpu_places(1)
        os.environ['CPU_NUM'] = str(args.cpu_num)
        place = fluid.cpu_places()
        DEV_COUNT = args.cpu_num
    logger.info("Dev Num is %s" % str(DEV_COUNT))
    exe = fluid.Executor(place[0])
    if args.do_train and args.build_dict:
        DataProcesser.build_dict(args.data_dir + "train.txt", args.data_dir)
    # read dict
    char_dict = DataProcesser.read_dict(args.data_dir + "char.dict")
    dict_dim = len(char_dict)
    intent_dict = DataProcesser.read_dict(args.data_dir + "domain.dict")
    id2intent = {}
    for key, value in intent_dict.items():
        id2intent[int(value)] = key
    num_labels = len(intent_dict)
    # build model
    loader_res = build_data_loader(args, char_dict, intent_dict)
    build_res = build_graph(args, model_config, num_labels, dict_dim, place,
                            test_place, loader_res)
    build_res["place"] = place
    build_res["test_place"] = test_place
    if not (args.do_train or args.do_eval or args.do_test):
        raise ValueError("For args `do_train`, `do_eval` and `do_test`, at "
                         "least one of them must be True.")

    exe.run(startup_prog)
    if args.init_checkpoint and args.init_checkpoint != "None":
        try:
            init_checkpoint(
                exe, args.init_checkpoint, main_program=startup_prog)
            logger.info("Load model from %s" % args.init_checkpoint)
        except Exception as e:
            logger.exception(str(e))
491 492
            logger.error("Faild load model from %s [%s]" %
                         (args.init_checkpoint, str(e)))
    build_strategy = fluid.compiler.BuildStrategy()
    build_strategy.fuse_all_reduce_ops = False
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
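    # num_threads=1 keeps each device's executor single-threaded; parallelism
    # instead comes from replicating the program over DEV_COUNT devices via
    # with_data_parallel below.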
    # add compiled prog
    if args.do_train:
        compiled_prog = fluid.compiler.CompiledProgram(
            build_res["train_prog"]).with_data_parallel(
                loss_name=build_res["cost"].name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
        build_res["compiled_prog"] = compiled_prog
    if args.do_test:
        test_compiled_prog = fluid.compiler.CompiledProgram(build_res[
            "test_prog"])
        build_res["test_compiled_prog"] = test_compiled_prog
    if args.do_eval:
        eval_compiled_prog = fluid.compiler.CompiledProgram(build_res[
            "eval_prog"])
        build_res["eval_compiled_prog"] = eval_compiled_prog

    if args.do_train:
        train(args, exe, build_res, place)
    if args.do_eval:
        evaluate(args, exe, build_res, "eval",
                 save_result=True, id2intent=id2intent)
    if args.do_test:
        evaluate(args, exe, build_res, "test",
                 save_result=True, id2intent=id2intent)


if __name__ == "__main__":
    logger.info("the paddle version is %s" % paddle.__version__)
    check_version('1.6.0')
    print_arguments(args)
    main(args)