diff --git a/fluid/sequence_tagging_for_ner/README.md b/fluid/sequence_tagging_for_ner/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f634da4e2e385b06589cde0c6979812ff52e450
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/README.md
@@ -0,0 +1,120 @@
+# Named Entity Recognition
+
+A brief overview of this example's directory layout:
+
+```text
+.
+├── data             # data this example depends on; fetched externally
+├── network_conf.py  # model definition
+├── reader.py        # data reading interface; fetched externally
+├── README.md        # this document
+├── train.py         # training script
+├── infer.py         # inference script
+├── utils.py         # common utility functions; fetched externally
+└── utils_extend.py  # extensions to utils.py
+```
+
+## Introduction and Model Overview
+
+The PaddlePaddle v2 example [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) introduces the NER task in detail, so the introduction is not repeated here.
+The model follows the v2 structure; the only difference is that the original RNN is replaced with an LSTM.
+
+## Getting the Data
+
+Follow the data acquisition steps of the PaddlePaddle v2 example [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md): copy its data folder into this example's directory and run the download.sh script inside it to fetch the training and test data.
+
+## Getting the Shared Scripts
+
+Copy [reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py) (the data-reading module) and [utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py) (dictionary loading and other shared helpers) from the PaddlePaddle v2 example [Named Entity Recognition](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) into this directory. Both scripts are required by this example.
+
+## Training
+
+1. Run `sh data/download.sh`.
+2. Modify the `main` function in `train.py` to specify the data paths:
+
+    ```python
+    main(
+        train_data_file="data/train",
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        emb_file="data/wordVectors.txt",
+        model_save_dir="models",
+        num_passes=1000,
+        use_gpu=False,
+        parallel=False)
+    ```
+
+3. Run `python train.py`. **Note: running it directly trains on the sample data; replace it with real labeled data.**
+
+    ```text
+    Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
+    Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
+    Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
+    Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
+    Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
+    Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
+    Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
+    Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
+    Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
+    Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
+    Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
+    Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
+    Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
+    Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
+    Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
+    [TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
+    [TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
+    ```
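+
+    The precision, recall, and F1 columns are computed by `fluid.evaluator.ChunkEvaluator` (see `train.py` below). As a quick sanity check on the log format, the printed F1 is the harmonic mean of the printed precision and recall; a minimal illustrative snippet, using the values from the first log line above:
+
+    ```python
+    # F1 is the harmonic mean of precision and recall
+    # (values copied from the first log line above).
+    p, r = 0.3954984, 0.37846154
+    f1 = 2 * p * r / (p + r)
+    print(f1)  # ~0.38679245, matching the logged F1_score
+    ```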
+
+## Inference
+
+1. Modify the `infer` function in [infer.py](./infer.py) to specify the path of the model to test, the test data, the vocabulary file, and the target (label) file. The default arguments are:
+
+    ```python
+    infer(
+        model_path="models/params_pass_0",
+        batch_size=6,
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        use_gpu=False
+    )
+    ```
+
+2. Run `python infer.py` in a terminal to start inference. You will see predictions like the following (partial results from a model trained for 70 passes):
+
+    ```text
+    leicestershire B-ORG B-LOC
+    extended O O
+    their O O
+    first O O
+    innings O O
+    by O O
+    DGDG O O
+    runs O O
+    before O O
+    being O O
+    bowled O O
+    out O O
+    for O O
+    296 O O
+    with O O
+    england B-LOC B-LOC
+    discard O O
+    andy B-PER B-PER
+    caddick I-PER I-PER
+    taking O O
+    three O O
+    for O O
+    DGDG O O
+    . O O
+    ```
+
+    The output has three columns separated by "\t": the first is the input word, the second the gold-standard label, and the third the predicted label. Input sequences are separated by blank lines.
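+
+Internally, `infer.py` feeds each batch to the executor as LoD tensors built by `to_lodtensor` from `utils_extend.py` (included in this change). A minimal sketch of how the level-0 LoD offsets encode a batch of variable-length sequences:
+
+```python
+# Offsets for a batch of sequences of lengths [3, 1, 2]: the flattened
+# tensor holds all 6 tokens, and lod marks the sequence boundaries.
+seq_lens = [3, 1, 2]
+lod = [0]
+for length in seq_lens:
+    lod.append(lod[-1] + length)
+print(lod)  # [0, 3, 4, 6]
+```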

+## Sample Results
+
+<p align="center">
+<img src="imgs/convergence_curve.png"/><br/>
+Figure 1. Learning curve: the x-axis is the number of training passes and the y-axis the F1 score
+</p>
diff --git a/fluid/sequence_tagging_for_ner/imgs/convergence_curve.png b/fluid/sequence_tagging_for_ner/imgs/convergence_curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..6b862b751dd7ec0ef761dce78b9515769366d5f4
Binary files /dev/null and b/fluid/sequence_tagging_for_ner/imgs/convergence_curve.png differ
diff --git a/fluid/sequence_tagging_for_ner/infer.py b/fluid/sequence_tagging_for_ner/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d0bd9496ed2ec1db019a0124905093e0b12531a
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/infer.py
@@ -0,0 +1,71 @@
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+from network_conf import ner_net
+import reader
+from utils import load_dict, load_reverse_dict
+from utils_extend import to_lodtensor
+
+
+def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
+          use_gpu):
+    """
+    Use the model under model_path to predict the test data;
+    the results are printed to the screen. Returns nothing.
+    """
+    word_dict = load_dict(vocab_file)
+    word_reverse_dict = load_reverse_dict(vocab_file)
+
+    label_dict = load_dict(target_file)
+    label_reverse_dict = load_reverse_dict(target_file)
+
+    test_data = paddle.batch(
+        reader.data_reader(test_data_file, word_dict, label_dict),
+        batch_size=batch_size)
+    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+        for data in test_data():
+            word = to_lodtensor(map(lambda x: x[0], data), place)
+            mark = to_lodtensor(map(lambda x: x[1], data), place)
+            target = to_lodtensor(map(lambda x: x[2], data), place)
+            crf_decode = exe.run(
+                inference_program,
+                feed={"word": word,
+                      "mark": mark,
+                      "target": target},
+                fetch_list=fetch_targets,
+                return_numpy=False)
+            lod_info = (crf_decode[0].lod())[0]
+            np_data = np.array(crf_decode[0])
+            assert len(data) == len(lod_info) - 1
+            for sen_index in xrange(len(data)):
+                assert len(data[sen_index][0]) == lod_info[
+                    sen_index + 1] - lod_info[sen_index]
+                word_index = 0
+                for tag_index in xrange(lod_info[sen_index],
+                                        lod_info[sen_index + 1]):
+                    word = word_reverse_dict[data[sen_index][0][word_index]]
+                    gold_tag = label_reverse_dict[data[sen_index][2][
+                        word_index]]
+                    tag = label_reverse_dict[np_data[tag_index][0]]
+                    print word + "\t" + gold_tag + "\t" + tag
+                    word_index += 1
+                print ""
+
+
+if __name__ == "__main__":
+    infer(
+        model_path="models/params_pass_0",
+        batch_size=6,
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        use_gpu=False)
diff --git a/fluid/sequence_tagging_for_ner/network_conf.py b/fluid/sequence_tagging_for_ner/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eaa704f67641bd9bb98bbac162a0adb7a72c246
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/network_conf.py
@@ -0,0 +1,127 @@
+import math
+
+import paddle.fluid as fluid
+from paddle.fluid.initializer import NormalInitializer
+
+from utils import logger, load_dict, get_embedding
+
+
+def ner_net(word_dict_len, label_dict_len, parallel, stack_num=2):
+    mark_dict_len = 2
+    word_dim = 50
+    mark_dim = 5
+    hidden_dim = 300
+    IS_SPARSE = True
+    embedding_name = 'emb'
+
+    def _net_conf(word, mark, target):
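+        # `word` indexes the 50-d pre-trained embedding table: train.py
+        # loads it from wordVectors.txt and it is frozen here via
+        # trainable=False. `mark` is the two-valued word feature produced
+        # by reader.py (in the v2 example it marks capitalization) and
+        # gets a small trainable embedding of its own.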
+        word_embedding = fluid.layers.embedding(
+            input=word,
+            size=[word_dict_len, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False))
+
+        mark_embedding = fluid.layers.embedding(
+            input=mark,
+            size=[mark_dict_len, mark_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
+
+        word_caps_vector = fluid.layers.concat(
+            input=[word_embedding, mark_embedding], axis=1)
+        mix_hidden_lr = 1
+
+        rnn_para_attr = fluid.ParamAttr(
+            initializer=NormalInitializer(
+                loc=0.0, scale=0.0),
+            learning_rate=mix_hidden_lr)
+        hidden_para_attr = fluid.ParamAttr(
+            initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
+            learning_rate=mix_hidden_lr)
+
+        hidden = fluid.layers.fc(
+            input=word_caps_vector,
+            name="__hidden00__",
+            size=hidden_dim,
+            act="tanh",
+            bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
+            param_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
+        fea = []
+        for direction in ["fwd", "bwd"]:
+            for i in range(stack_num):
+                if i != 0:
+                    hidden = fluid.layers.fc(
+                        name="__hidden%02d_%s__" % (i, direction),
+                        size=hidden_dim,
+                        act="stanh",
+                        bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                            loc=0.0, scale=1.0)),
+                        input=[hidden, rnn[0], rnn[1]],
+                        param_attr=[
+                            hidden_para_attr, rnn_para_attr, rnn_para_attr
+                        ])
+                rnn = fluid.layers.dynamic_lstm(
+                    name="__rnn%02d_%s__" % (i, direction),
+                    input=hidden,
+                    size=hidden_dim,
+                    candidate_activation='relu',
+                    gate_activation='sigmoid',
+                    cell_activation='sigmoid',
+                    bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                        loc=0.0, scale=1.0)),
+                    is_reverse=(i % 2) if direction == "fwd" else not i % 2,
+                    param_attr=rnn_para_attr)
+            fea += [hidden, rnn[0], rnn[1]]
+
+        rnn_fea = fluid.layers.fc(
+            size=hidden_dim,
+            bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
+            act="stanh",
+            input=fea,
+            param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2)
+
+        emission = fluid.layers.fc(
+            size=label_dict_len,
+            input=rnn_fea,
+            param_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
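+
+        # The CRF transition parameters are created under the shared name
+        # 'crfw'; crf_decoding in train.py looks the parameter up by this
+        # name, so training and decoding use the same transitions.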
+        crf_cost = fluid.layers.linear_chain_crf(
+            input=emission,
+            label=target,
+            param_attr=fluid.ParamAttr(
+                name='crfw',
+                initializer=NormalInitializer(
+                    loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
+                learning_rate=mix_hidden_lr))
+        avg_cost = fluid.layers.mean(x=crf_cost)
+        return avg_cost, emission
+
+    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
+    target = fluid.layers.data(
+        name="target", shape=[1], dtype='int64', lod_level=1)
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            word_ = pd.read_input(word)
+            mark_ = pd.read_input(mark)
+            target_ = pd.read_input(target)
+            avg_cost, emission_base = _net_conf(word_, mark_, target_)
+            pd.write_output(avg_cost)
+            pd.write_output(emission_base)
+        avg_cost_list, emission = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost_list)
+        emission.stop_gradient = True
+    else:
+        avg_cost, emission = _net_conf(word, mark, target)
+
+    return avg_cost, emission, word, mark, target
diff --git a/fluid/sequence_tagging_for_ner/train.py b/fluid/sequence_tagging_for_ner/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ed77cd5ca1d504a8b79b4f87349242b5051c539
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/train.py
@@ -0,0 +1,122 @@
+import os
+import math
+import numpy as np
+
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+import reader
+from network_conf import ner_net
+from utils import logger, load_dict
+from utils_extend import to_lodtensor, get_embedding
+
+
+def test(exe, chunk_evaluator, inference_program, test_data, place):
+    chunk_evaluator.reset(exe)
+    for data in test_data():
+        word = to_lodtensor(map(lambda x: x[0], data), place)
+        mark = to_lodtensor(map(lambda x: x[1], data), place)
+        target = to_lodtensor(map(lambda x: x[2], data), place)
+        exe.run(inference_program,
+                feed={"word": word,
+                      "mark": mark,
+                      "target": target})
+    return chunk_evaluator.eval(exe)
+
+
+def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
+         model_save_dir, num_passes, use_gpu, parallel):
+    if not os.path.exists(model_save_dir):
+        os.mkdir(model_save_dir)
+
+    BATCH_SIZE = 200
+    word_dict = load_dict(vocab_file)
+    label_dict = load_dict(target_file)
+
+    word_vector_values = get_embedding(emb_file)
+
+    word_dict_len = len(word_dict)
+    label_dict_len = len(label_dict)
+
+    avg_cost, feature_out, word, mark, target = ner_net(
+        word_dict_len, label_dict_len, parallel)
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
+    sgd_optimizer.minimize(avg_cost)
+
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
+        input=crf_decode,
+        label=target,
+        chunk_scheme="IOB",
+        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = chunk_evaluator.metrics + chunk_evaluator.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader.data_reader(train_data_file, word_dict, label_dict),
+            buf_size=20000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            reader.data_reader(test_data_file, word_dict, label_dict),
+            buf_size=20000),
+        batch_size=BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
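+    # Replace the randomly initialized `emb` parameter with the
+    # pre-trained vectors loaded from emb_file; network_conf.py creates
+    # this parameter with trainable=False, so it stays fixed in training.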
+    embedding_name = 'emb'
+    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
+    embedding_param.set(word_vector_values, place)
+
+    batch_id = 0
+    for pass_id in xrange(num_passes):
+        chunk_evaluator.reset(exe)
+        for data in train_reader():
+            cost, batch_precision, batch_recall, batch_f1_score = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost] + chunk_evaluator.metrics)
+            if batch_id % 5 == 0:
+                print("Pass " + str(pass_id) + ", Batch " + str(batch_id) +
+                      ", Cost " + str(cost[0]) + ", Precision " +
+                      str(batch_precision[0]) + ", Recall " +
+                      str(batch_recall[0]) + ", F1_score" +
+                      str(batch_f1_score[0]))
+            batch_id = batch_id + 1
+
+        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
+        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
+              str(pass_precision) + " pass_recall:" + str(pass_recall) +
+              " pass_f1_score:" + str(pass_f1_score))
+        pass_precision, pass_recall, pass_f1_score = test(
+            exe, chunk_evaluator, inference_program, test_reader, place)
+        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
+              str(pass_precision) + " pass_recall:" + str(pass_recall) +
+              " pass_f1_score:" + str(pass_f1_score))
+
+        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
+        fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
+                                      [crf_decode], exe)
+
+
+if __name__ == "__main__":
+    main(
+        train_data_file="data/train",
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt",
+        emb_file="data/wordVectors.txt",
+        model_save_dir="models",
+        num_passes=1000,
+        use_gpu=False,
+        parallel=False)
diff --git a/fluid/sequence_tagging_for_ner/utils_extend.py b/fluid/sequence_tagging_for_ner/utils_extend.py
new file mode 100644
index 0000000000000000000000000000000000000000..03e7e62fd5f8496d4a9436ad34ec7763b46b460d
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/utils_extend.py
@@ -0,0 +1,28 @@
+import numpy as np
+
+import paddle.fluid as fluid
+
+
+def get_embedding(emb_file='data/wordVectors.txt'):
+    """
+    Get the trained word vectors.
+    """
+    return np.loadtxt(emb_file, dtype='float32')
+
+
+def to_lodtensor(data, place):
+    """
+    Convert a batch of variable-length sequences to a LoDTensor.
+    """
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res