diff --git a/fluid/sequence_tagging_for_ner/infer.py b/fluid/sequence_tagging_for_ner/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae1e88234625c14aee4af00b3200716190b539b8
--- /dev/null
+++ b/fluid/sequence_tagging_for_ner/infer.py
@@ -0,0 +1,62 @@
+import gzip
+import numpy as np
+import reader
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+from network_conf import ner_net
+from utils import load_dict, load_reverse_dict
+
+
+def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
+    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+
+    word_dict = load_dict(vocab_file)
+    word_reverse_dict = load_reverse_dict(vocab_file)
+
+    label_dict = load_dict(target_file)
+    label_reverse_dict = load_reverse_dict(target_file)
+
+    test_data = paddle.batch(
+        reader.data_reader(test_data_file, word_dict, label_dict),
+        batch_size=batch_size)
+    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
+        for data in test_data():
+            crf_decode = exe.run(inference_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=fetch_targets,
+                                 return_numpy=False)
+            lod_info = (crf_decode[0].lod())[0]
+            np_data = np.array(crf_decode[0])
+            assert len(data) == len(lod_info) - 1
+            for sen_index in xrange(len(data)):
+                assert len(data[sen_index][0]) == lod_info[
+                    sen_index + 1] - lod_info[sen_index]
+                word_index = 0
+                for tag_index in xrange(lod_info[sen_index],
+                                        lod_info[sen_index + 1]):
+                    word = word_reverse_dict[data[sen_index][0][word_index]]
+                    gold_tag = label_reverse_dict[data[sen_index][2][
+                        word_index]]
+                    tag = label_reverse_dict[np_data[tag_index][0]]
+                    print word + "\t" + gold_tag + "\t" + tag
+                    word_index += 1
+                print ""
+
+
+if __name__ == "__main__":
+    infer(
+        model_path="models/params_pass_0",
+        batch_size=6,
+        test_data_file="data/test",
+        vocab_file="data/vocab.txt",
+        target_file="data/target.txt")
diff --git a/fluid/sequence_tagging_for_ner/network_conf.py b/fluid/sequence_tagging_for_ner/network_conf.py
index df23000399f1474ae6a9e4f79d94602aa420a360..5eaa704f67641bd9bb98bbac162a0adb7a72c246 100644
--- a/fluid/sequence_tagging_for_ner/network_conf.py
+++ b/fluid/sequence_tagging_for_ner/network_conf.py
@@ -1,10 +1,12 @@
+import math
+
 import paddle.fluid as fluid
 from paddle.fluid.initializer import NormalInitializer
+
 from utils import logger, load_dict, get_embedding
-import math
 
 
-def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
+def ner_net(word_dict_len, label_dict_len, parallel, stack_num=2):
     mark_dict_len = 2
     word_dim = 50
     mark_dim = 5
@@ -12,92 +14,83 @@ def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
     IS_SPARSE = True
     embedding_name = 'emb'
 
-    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
-
-    word_embedding = fluid.layers.embedding(
-        input=word,
-        size=[word_dict_len, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr=fluid.ParamAttr(
-            name=embedding_name, trainable=False))
-
-    mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
+    def _net_conf(word, mark, target):
+        word_embedding = fluid.layers.embedding(
+            input=word,
+            size=[word_dict_len, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False))
 
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE)
+        mark_embedding = fluid.layers.embedding(
+            input=mark,
+            size=[mark_dict_len, mark_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
 
-    word_caps_vector = fluid.layers.concat(
-        input=[word_embedding, mark_embedding], axis=1)
-    mix_hidden_lr = 1
+        word_caps_vector = fluid.layers.concat(
+            input=[word_embedding, mark_embedding], axis=1)
+        mix_hidden_lr = 1
 
-    rnn_para_attr = fluid.ParamAttr(
-        initializer=NormalInitializer(
-            loc=0.0, scale=0.0, seed=0),
-        learning_rate=mix_hidden_lr)
-    hidden_para_attr = fluid.ParamAttr(
-        initializer=NormalInitializer(
-            loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3), seed=0),
-        learning_rate=mix_hidden_lr)
+        rnn_para_attr = fluid.ParamAttr(
+            initializer=NormalInitializer(
+                loc=0.0, scale=0.0),
+            learning_rate=mix_hidden_lr)
+        hidden_para_attr = fluid.ParamAttr(
+            initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
+            learning_rate=mix_hidden_lr)
 
-    hidden = fluid.layers.fc(
-        input=word_caps_vector,
-        name="__hidden00__",
-        size=hidden_dim,
-        act="tanh",
-        bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
-            loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3), seed=0)),
-        param_attr=fluid.ParamAttr(initializer=NormalInitializer(
-            loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3), seed=0)))
-    fea = []
-    for direction in ["fwd", "bwd"]:
-        for i in range(stack_num):
-            if i != 0:
-                hidden = fluid.layers.fc(
-                    name="__hidden%02d_%s__" % (i, direction),
+        hidden = fluid.layers.fc(
+            input=word_caps_vector,
+            name="__hidden00__",
+            size=hidden_dim,
+            act="tanh",
+            bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
+            param_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
+        fea = []
+        for direction in ["fwd", "bwd"]:
+            for i in range(stack_num):
+                if i != 0:
+                    hidden = fluid.layers.fc(
+                        name="__hidden%02d_%s__" % (i, direction),
+                        size=hidden_dim,
+                        act="stanh",
+                        bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                            loc=0.0, scale=1.0)),
+                        input=[hidden, rnn[0], rnn[1]],
+                        param_attr=[
+                            hidden_para_attr, rnn_para_attr, rnn_para_attr
+                        ])
+                rnn = fluid.layers.dynamic_lstm(
+                    name="__rnn%02d_%s__" % (i, direction),
+                    input=hidden,
                     size=hidden_dim,
-                    act="stanh",
+                    candidate_activation='relu',
+                    gate_activation='sigmoid',
+                    cell_activation='sigmoid',
                     bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
-                        loc=0.0, scale=1.0, seed=0)),
-                    input=[hidden, rnn[0], rnn[1]],
-                    param_attr=[
-                        hidden_para_attr, rnn_para_attr, rnn_para_attr
-                    ])
-            rnn = fluid.layers.dynamic_lstm(
-                name="__rnn%02d_%s__" % (i, direction),
-                input=hidden,
-                size=hidden_dim,
-                candidate_activation='relu',
-                gate_activation='sigmoid',
-                cell_activation='sigmoid',
-                bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
-                    loc=0.0, scale=1.0, seed=0)),
-                is_reverse=(i % 2) if direction == "fwd" else not i % 2,
-                param_attr=rnn_para_attr)
-            fea += [hidden, rnn[0], rnn[1]]
+                        loc=0.0, scale=1.0)),
+                    is_reverse=(i % 2) if direction == "fwd" else not i % 2,
+                    param_attr=rnn_para_attr)
+                fea += [hidden, rnn[0], rnn[1]]
 
-    rnn_fea = fluid.layers.fc(
-        size=hidden_dim,
-        bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
-            loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3), seed=0)),
-        act="stanh",
-        input=fea,
-        param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2)
+        rnn_fea = fluid.layers.fc(
+            size=hidden_dim,
+            bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
+            act="stanh",
+            input=fea,
+            param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2)
 
-    emission = fluid.layers.fc(size=label_dict_len,
-                               input=rnn_fea,
-                               param_attr=fluid.ParamAttr(
-                                   initializer=NormalInitializer(
-                                       loc=0.0,
-                                       scale=(1. / math.sqrt(hidden_dim) / 3),
-                                       seed=0)))
-
-    if is_train:
-        target = fluid.layers.data(
-            name="target", shape=[1], dtype='int64', lod_level=1)
+        emission = fluid.layers.fc(
+            size=label_dict_len,
+            input=rnn_fea,
+            param_attr=fluid.ParamAttr(initializer=NormalInitializer(
+                loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
 
         crf_cost = fluid.layers.linear_chain_crf(
             input=emission,
@@ -105,11 +98,30 @@ def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
             param_attr=fluid.ParamAttr(
                 name='crfw',
                 initializer=NormalInitializer(
-                    loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3), seed=0),
+                    loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
                 learning_rate=mix_hidden_lr))
-        return crf_cost, emission, word, mark, target
+        avg_cost = fluid.layers.mean(x=crf_cost)
+        return avg_cost, emission
 
+    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
+    target = fluid.layers.data(
+        name="target", shape=[1], dtype='int64', lod_level=1)
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            word_ = pd.read_input(word)
+            mark_ = pd.read_input(mark)
+            target_ = pd.read_input(target)
+            avg_cost, emission_base = _net_conf(word_, mark_, target_)
+            pd.write_output(avg_cost)
+            pd.write_output(emission_base)
+        avg_cost_list, emission = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost_list)
+        emission.stop_gradient = True
     else:
-        predict = fluid.layers.crf_decoding(
-            input=emission, param_attr=fluid.ParamAttr(name='crfw'))
-        return predict
+        avg_cost, emission = _net_conf(word, mark, target)
+
+    return avg_cost, emission, word, mark, target
diff --git a/fluid/sequence_tagging_for_ner/reader.py b/fluid/sequence_tagging_for_ner/reader.py
index 5050d0bf499e59db505758b0af9eed71e6af7de7..a817dd199987ae0050014595296fe4717ab198e4 100644
--- a/fluid/sequence_tagging_for_ner/reader.py
+++ b/fluid/sequence_tagging_for_ner/reader.py
@@ -1,8 +1,7 @@
 """
 Conll03 dataset.
 """
-
-from utils import *
+import re
 
 __all__ = ["data_reader"]
 
diff --git a/fluid/sequence_tagging_for_ner/train.py b/fluid/sequence_tagging_for_ner/train.py
index 87ecae6c070372eb6b977ad240ed006c688a27db..02589f34bcc1ce792ae938c6228524c612df34f9 100644
--- a/fluid/sequence_tagging_for_ner/train.py
+++ b/fluid/sequence_tagging_for_ner/train.py
@@ -1,11 +1,13 @@
+import os
+import math
+
+import numpy as np
 import paddle.v2 as paddle
 import paddle.fluid as fluid
+
+import reader
 from network_conf import ner_net
 from utils import logger, load_dict, get_embedding
-import reader
-import os
-import math
-import numpy as np
 
 
 def to_lodtensor(data, place):
@@ -36,17 +38,12 @@ def test(exe, chunk_evaluator, inference_program, test_data, place):
     return chunk_evaluator.eval(exe)
 
 
-def main(train_data_file,
-         test_data_file,
-         vocab_file,
-         target_file,
-         emb_file,
-         model_save_dir,
-         num_passes=100,
-         batch_size=64):
+def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
+         model_save_dir, num_passes, use_gpu, parallel):
     if not os.path.exists(model_save_dir):
         os.mkdir(model_save_dir)
 
+    BATCH_SIZE = 200
     word_dict = load_dict(vocab_file)
     label_dict = load_dict(target_file)
 
@@ -55,11 +52,10 @@ def main(train_data_file,
     word_dict_len = len(word_dict)
     label_dict_len = len(label_dict)
 
-    crf_cost, feature_out, word, mark, target = ner_net(word_dict_len,
-                                                        label_dict_len)
-    avg_cost = fluid.layers.mean(x=crf_cost)
+    avg_cost, feature_out, word, mark, target = ner_net(
+        word_dict_len, label_dict_len, parallel)
 
-    sgd_optimizer = fluid.optimizer.Momentum(momentum=0.0, learning_rate=1e-3)
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
     sgd_optimizer.minimize(avg_cost)
 
     crf_decode = fluid.layers.crf_decoding(
@@ -79,15 +75,16 @@ def main(train_data_file,
     train_reader = paddle.batch(
         paddle.reader.shuffle(
             reader.data_reader(train_data_file, word_dict, label_dict),
-            buf_size=1000),
-        batch_size=batch_size)
+            buf_size=20000),
+        batch_size=BATCH_SIZE)
     test_reader = paddle.batch(
         paddle.reader.shuffle(
             reader.data_reader(test_data_file, word_dict, label_dict),
-            buf_size=1000),
-        batch_size=batch_size)
+            buf_size=20000),
+        batch_size=BATCH_SIZE)
 
-    place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+    #place = fluid.CPUPlace()
     feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
     exe = fluid.Executor(place)
 
@@ -101,27 +98,33 @@ def main(train_data_file,
     for pass_id in xrange(num_passes):
         chunk_evaluator.reset(exe)
         for data in train_reader():
-            cost, precision, recall, f1_score = exe.run(
+            print len(data)
+            cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                 fluid.default_main_program(),
                 feed=feeder.feed(data),
                 fetch_list=[avg_cost] + chunk_evaluator.metrics)
             if batch_id % 5 == 0:
-                print("Pass " + str(pass_id) + ", Batch " + str(batch_id) +
-                      ", Cost " + str(cost) + ", Precision " + str(precision) +
-                      ", Recall " + str(recall) + ", F1_score" + str(f1_score))
+                print(
+                    "Pass " + str(pass_id) + ", Batch " + str(batch_id) +
+                    ", Cost " + str(cost[0]) + ", Precision " +
+                    str(batch_precision[0]) + ", Recall " + str(batch_recall[0])
+                    + ", F1_score" + str(batch_f1_score[0]))
             batch_id = batch_id + 1
 
-        pass_precision, pass_recall, pass_f1_score = test(
-            exe, chunk_evaluator, inference_program, train_reader, place)
-        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
-            pass_precision) + " pass_recall:" + str(pass_recall) +
+        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
+        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
+              str(pass_precision) + " pass_recall:" + str(pass_recall) +
               " pass_f1_score:" + str(pass_f1_score))
 
         pass_precision, pass_recall, pass_f1_score = test(
             exe, chunk_evaluator, inference_program, test_reader, place)
-        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
-            pass_precision) + " pass_recall:" + str(pass_recall) +
+        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
+              str(pass_precision) + " pass_recall:" + str(pass_recall) +
               " pass_f1_score:" + str(pass_f1_score))
 
+        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
+        fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
+                                      [crf_decode], exe)
+
 
 if __name__ == "__main__":
     main(
@@ -130,5 +133,7 @@ if __name__ == "__main__":
         vocab_file="data/vocab.txt",
         target_file="data/target.txt",
        emb_file="data/wordVectors.txt",
-        model_save_dir="models/",
-        num_passes=1000)
+        model_save_dir="models",
+        num_passes=1000,
+        use_gpu=False,
+        parallel=True)
diff --git a/fluid/sequence_tagging_for_ner/utils.py b/fluid/sequence_tagging_for_ner/utils.py
index c57422c20b88439098cedeb7a9cb4532d4582664..ef90da931f9a695d5b13fccc47acf462976cdb69 100644
--- a/fluid/sequence_tagging_for_ner/utils.py
+++ b/fluid/sequence_tagging_for_ner/utils.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 import logging
-import os
-import re
-import argparse
+
 import numpy as np
-from collections import defaultdict
 
 logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
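
Note on the LoD handling in infer.py: crf_decode comes back as a LoDTensor whose level-0 offsets delimit sentences inside one flat tag array, which is what the lod_info/sen_index arithmetic above walks. A toy illustration of that offset arithmetic in plain Python (no Paddle; the values are made up):

# Three sentences of lengths 4, 2 and 3 packed into one flat array.
# lod_info has num_sentences + 1 entries; sentence i occupies the
# half-open range [lod_info[i], lod_info[i + 1]) of the flat data.
lod_info = [0, 4, 6, 9]
flat_tags = ["B", "I", "I", "O", "B", "O", "B", "I", "O"]

for sen_index in range(len(lod_info) - 1):
    start, end = lod_info[sen_index], lod_info[sen_index + 1]
    print(flat_tags[start:end])
# ['B', 'I', 'I', 'O'], then ['B', 'O'], then ['B', 'I', 'O']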
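
For context, the contract tying the two new entry points together: every pass, train.py writes models/params_pass_<pass_id> via fluid.io.save_inference_model with feed order ['word', 'mark', 'target'] and fetch target [crf_decode], and infer.py points fluid.io.load_inference_model at one such directory. A minimal smoke-test driver (hypothetical, not part of this PR; it runs each script in its own process so their fluid programs stay independent):

# Hypothetical round-trip check, assuming the data/ files fetched by the
# repo's download script are present.
import subprocess

# train.py's __main__ block trains (use_gpu=False, parallel=True by the
# defaults above) and saves models/params_pass_<N> each pass.
subprocess.check_call(["python", "train.py"])

# infer.py's __main__ block reloads models/params_pass_0 and prints
# word / gold tag / predicted tag columns for the test set.
subprocess.check_call(["python", "infer.py"])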