# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Baidu's open-source Lexical Analysis tool for Chinese, including:
1. Word Segmentation,
2. Part-of-Speech Tagging, and
3. Named Entity Recognition.
"""
from __future__ import print_function

import os
import sys
import math
import time
import random
import argparse
import multiprocessing

import numpy as np
import paddle
import paddle.fluid as fluid

import reader
import utils

sys.path.append("../")
from models.sequence_labeling import nets
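
# A typical invocation, assuming the default ./conf and ./data layout that
# ships with the repository (the script name and flag values below are
# illustrative, not prescriptive):
#
#   python run_sequence_labeling.py \
#       --do_train true --do_test true --use_cuda false \
#       --train_data ./data/train.tsv --test_data ./data/test.tsv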
# yapf: disable
parser = argparse.ArgumentParser(__doc__)

# 1. model parameters
model_g = utils.ArgumentGroup(parser, "model", "model configuration")
model_g.add_arg("word_emb_dim", int, 128, "The dimension in which a word is embedded.")
model_g.add_arg("grnn_hidden_dim", int, 256, "The number of hidden nodes in the GRNN layer.")
model_g.add_arg("bigru_num", int, 2, "The number of bi-GRU layers in the network.")

# 2. data parameters
data_g = utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement dictionary.")
data_g.add_arg("train_data", str, "./data/train.tsv", "The path of the training data.")
data_g.add_arg("test_data", str, "./data/test.tsv", "The path of the test data.")
data_g.add_arg("infer_data", str, "./data/test.tsv", "The path of the inference data.")
data_g.add_arg("model_save_dir", str, "./models", "The model will be saved in this path.")
data_g.add_arg("init_checkpoint", str, "", "Path to an initialization checkpoint.")

# 3. training parameters
train_g = utils.ArgumentGroup(parser, "training", "training options")
train_g.add_arg("do_train", bool, True, "Whether to perform training.")
train_g.add_arg("do_test", bool, True, "Whether to perform testing.")
train_g.add_arg("do_infer", bool, False, "Whether to perform inference.")
train_g.add_arg("random_seed", int, 0, "Random seed for training.")
train_g.add_arg("save_model_per_batches", int, 10000, "Save the model once every this many training batches.")
train_g.add_arg("valid_model_per_batches", int, 1000, "Run validation once every this many training batches.")
train_g.add_arg("batch_size", int, 80, "The number of sequences contained in a mini-batch, "
                "or the maximum number of tokens (including paddings) contained in a mini-batch.")
train_g.add_arg("epoch", int, 10, "The number of passes over the training corpus.")
train_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
train_g.add_arg("traindata_shuffle_buffer", int, 200, "The buffer size used to shuffle the training data.")
train_g.add_arg("base_learning_rate", float, 1e-3, "The basic learning rate that affects the entire network.")
train_g.add_arg("emb_learning_rate", float, 5,
                "The real learning rate of the embedding layer will be (emb_learning_rate * base_learning_rate).")
train_g.add_arg("crf_learning_rate", float, 0.2,
                "The real learning rate of the CRF layer will be (crf_learning_rate * base_learning_rate).")

parser.add_argument('--enable_ce', action='store_true',
                    help='If set, run the task with continuous evaluation logs.')

args = parser.parse_args()
# yapf: enable

sys.path.append('../models/')
from model_check import check_cuda
check_cuda(args.use_cuda)

print(args)


def create_model(args, pyreader_name, vocab_size, num_labels):
    """Create the LAC network and its evaluation metrics."""
    # the reader feeds (words, targets) pairs; both are LoD tensors of int64 ids
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=([-1, 1], [-1, 1]),
        dtypes=('int64', 'int64'),
        lod_levels=(1, 1),
        name=pyreader_name,
        use_double_buffer=False)

    words, targets = fluid.layers.read_file(pyreader)

    avg_cost, crf_decode = nets.lex_net(words, targets, args, vocab_size, num_labels)

    (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
     num_correct_chunks) = fluid.layers.chunk_eval(
         input=crf_decode,
         label=targets,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((num_labels - 1) / 2.0)))
    chunk_evaluator = fluid.metrics.ChunkEvaluator()
    chunk_evaluator.reset()

    ret = {
        "pyreader": pyreader,
        "words": words,
        "targets": targets,
        "avg_cost": avg_cost,
        "crf_decode": crf_decode,
        "chunk_evaluator": chunk_evaluator,
        "num_infer_chunks": num_infer_chunks,
        "num_label_chunks": num_label_chunks,
        "num_correct_chunks": num_correct_chunks
    }
    return ret


def evaluate(exe, test_program, test_ret):
    """Evaluate on the test data."""
    test_ret["pyreader"].start()
    test_ret["chunk_evaluator"].reset()

    loss = []
    precision = []
    recall = []
    f1 = []
    start_time = time.time()
    while True:
        try:
            avg_loss, nums_infer, nums_label, nums_correct = exe.run(
                test_program,
                fetch_list=[
                    test_ret["avg_cost"],
                    test_ret["num_infer_chunks"],
                    test_ret["num_label_chunks"],
                    test_ret["num_correct_chunks"],
                ],
            )
            loss.append(avg_loss)
            test_ret["chunk_evaluator"].update(nums_infer, nums_label, nums_correct)
            p, r, f = test_ret["chunk_evaluator"].eval()
            precision.append(p)
            recall.append(r)
            f1.append(f)
        except fluid.core.EOFException:
            # the reader is exhausted; reset it so it can be started again
            test_ret["pyreader"].reset()
            break
    end_time = time.time()
    print("[test] avg loss: %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time: %.3f s"
          % (np.mean(loss), np.mean(precision), np.mean(recall), np.mean(f1),
             end_time - start_time))
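
# A quick sanity check on the chunk_eval configuration in create_model above.
# Under the IOB scheme each chunk type contributes a B- tag and an I- tag, and
# a single O tag covers non-chunk tokens, so num_labels = 2 * num_chunk_types + 1
# and therefore num_chunk_types = ceil((num_labels - 1) / 2). The numbers below
# are illustrative, not taken from the shipped tag.dic:
#
#   num_labels = 57  ->  int(math.ceil((57 - 1) / 2.0)) == 28 chunk types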

def main(args):
    startup_program = fluid.Program()
    if args.random_seed is not None:
        startup_program.random_seed = args.random_seed

    # prepare dataset
    dataset = reader.Dataset(args)

    if args.do_train:
        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed
        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                train_ret = create_model(
                    args, "train_reader", dataset.vocab_size, dataset.num_labels)
                train_ret["pyreader"].decorate_paddle_reader(
                    paddle.batch(
                        paddle.reader.shuffle(
                            dataset.file_reader(args.train_data),
                            buf_size=args.traindata_shuffle_buffer),
                        batch_size=args.batch_size))

                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.base_learning_rate)
                optimizer.minimize(train_ret["avg_cost"])

    if args.do_test:
        test_program = fluid.Program()
        with fluid.program_guard(test_program, startup_program):
            with fluid.unique_name.guard():
                test_ret = create_model(
                    args, "test_reader", dataset.vocab_size, dataset.num_labels)
                test_ret["pyreader"].decorate_paddle_reader(
                    paddle.batch(
                        dataset.file_reader(args.test_data),
                        batch_size=args.batch_size))
        # to share parameters with the train model
        test_program = test_program.clone(for_test=True)

    if args.do_infer:
        infer_program = fluid.Program()
        with fluid.program_guard(infer_program, startup_program):
            with fluid.unique_name.guard():
                infer_ret = create_model(
                    args, "infer_reader", dataset.vocab_size, dataset.num_labels)
                infer_ret["pyreader"].decorate_paddle_reader(
                    paddle.batch(
                        dataset.file_reader(args.infer_data),
                        batch_size=args.batch_size))
        infer_program = infer_program.clone(for_test=True)

    # init executor
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = multiprocessing.cpu_count()
    exe = fluid.Executor(place)
    exe.run(startup_program)

    # load checkpoints
    if args.do_train:
        if args.init_checkpoint:
            utils.init_checkpoint(exe, args.init_checkpoint, train_program)
    elif args.do_test:
        if not args.init_checkpoint:
            raise ValueError(
                "args 'init_checkpoint' should be set if only doing validation or testing!")
        utils.init_checkpoint(exe, args.init_checkpoint, test_program)
    if args.do_infer:
        utils.init_checkpoint(exe, args.init_checkpoint, infer_program)
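
    # Because the train/test/infer programs are all built against the same
    # startup_program under fluid.unique_name.guard(), their parameters share
    # names and live in the executor's global scope, so loading a checkpoint
    # into any one of the programs initializes the parameters used by all.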

    # start training
    if args.do_train:
        num_train_examples = dataset.get_num_examples(args.train_data)
        max_train_steps = args.epoch * num_train_examples // args.batch_size
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        ce_info = []
        batch_id = 0
        for epoch_id in range(args.epoch):
            train_ret["pyreader"].start()
            ce_time = 0
            try:
                while True:
                    start_time = time.time()
                    avg_cost, nums_infer, nums_label, nums_correct = exe.run(
                        train_program,
                        fetch_list=[
                            train_ret["avg_cost"],
                            train_ret["num_infer_chunks"],
                            train_ret["num_label_chunks"],
                            train_ret["num_correct_chunks"],
                        ],
                    )
                    end_time = time.time()

                    # per-batch metrics: the evaluator is reset on every step
                    train_ret["chunk_evaluator"].reset()
                    train_ret["chunk_evaluator"].update(nums_infer, nums_label,
                                                        nums_correct)
                    precision, recall, f1_score = train_ret["chunk_evaluator"].eval()
                    batch_id += 1
                    print("[train] batch_id = %d, loss = %.5f, P: %.5f, R: %.5f, F1: %.5f, elapsed time %.5f"
                          % (batch_id, avg_cost, precision, recall, f1_score,
                             end_time - start_time))
                    ce_time += end_time - start_time
                    ce_info.append([ce_time, avg_cost, precision, recall, f1_score])

                    # save checkpoints
                    if batch_id % args.save_model_per_batches == 0:
                        save_path = os.path.join(args.model_save_dir,
                                                 "step_" + str(batch_id))
                        fluid.io.save_persistables(exe, save_path, train_program)

                    # evaluate
                    if batch_id % args.valid_model_per_batches == 0 and args.do_test:
                        evaluate(exe, test_program, test_ret)
            except fluid.core.EOFException:
                # end of an epoch: save a checkpoint, then reset the reader
                # so the next epoch can restart it
                save_path = os.path.join(args.model_save_dir,
                                         "step_" + str(batch_id))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_ret["pyreader"].reset()

    if args.do_train and args.enable_ce:
        card_num = get_cards()
        ce_cost = 0
        ce_f1 = 0
        ce_p = 0
        ce_r = 0
        ce_time = 0
        try:
            ce_time = ce_info[-2][0]
            ce_cost = ce_info[-2][1]
            ce_p = ce_info[-2][2]
            ce_r = ce_info[-2][3]
            ce_f1 = ce_info[-2][4]
        except IndexError:
            print("ce info error")
        print("kpis\teach_step_duration_card%s\t%s" % (card_num, ce_time))
        print("kpis\ttrain_cost_card%s\t%f" % (card_num, ce_cost))
        print("kpis\ttrain_precision_card%s\t%f" % (card_num, ce_p))
        print("kpis\ttrain_recall_card%s\t%f" % (card_num, ce_r))
        print("kpis\ttrain_f1_card%s\t%f" % (card_num, ce_f1))

    # test only
    if args.do_test:
        evaluate(exe, test_program, test_ret)

    if args.do_infer:
        infer_ret["pyreader"].start()
        while True:
            try:
                (words, crf_decode) = exe.run(
                    infer_program,
                    fetch_list=[infer_ret["words"], infer_ret["crf_decode"]],
                    return_numpy=False)
                results = utils.parse_result(words, crf_decode, dataset)
                for result in results:
                    print(result)
            except fluid.core.EOFException:
                infer_ret["pyreader"].reset()
                break


def get_cards():
    """Return the number of GPU cards listed in CUDA_VISIBLE_DEVICES (0 if unset)."""
    num = 0
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    if cards != '':
        num = len(cards.split(","))
    return num


if __name__ == "__main__":
    main(args)