Commit cee7f2b9 authored by guosheng

change according to comments on commit 76fe6fc

Parent 29b56eec
# Named Entity Recognition

## Background

Named Entity Recognition (NER), also known as "proper name recognition", is the task of identifying entities with specific meanings in text, chiefly person names, place names, organization names, and other proper nouns; it is a fundamental problem in natural language processing. An NER task usually involves two parts, detecting entity boundaries and determining entity categories, so it can be cast as a sequence labeling problem: both the boundaries and the categories of the entities can be read directly off the labeled sequence. For example:
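In the BIO scheme used below, each token is labeled as the beginning of (B-), inside of (I-), or outside of (O) an entity chunk; an illustrative sentence (not taken from the dataset) might be labeled like this:

```python
# An illustrative tagged sentence: ORG/PER/LOC are entity types, and
# B-/I-/O mark the beginning of, inside of, and outside of a chunk.
tokens = ['U.N.', 'official', 'Ekeus', 'heads', 'for', 'Baghdad', '.']
labels = ['B-ORG', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
```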
## Data

This example uses the dataset released for the CoNLL 2003 NER task. For copyright reasons we cannot offer the dataset for download here; it can be obtained free of charge by following the instructions on [this page](http://www.clips.uantwerpen.be/conll2003/ner/). The training and test data in the dataset are formatted as follows:
<img src="image/data_format.png" width = "60%" align=center /><br>
The first column holds the original sentence, and the fourth column holds the NER label in the I-TYPE scheme. I-TYPE differs from the [BIO scheme](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles) mainly in how the start of a chunk is marked: I-TYPE uses a B tag only for the second of two adjacent entities of the same type, and an I tag everywhere else. In this example we work with labels in the BIO scheme; the conversion between the two schemes is performed in the provided `conll03.py` (see the sketch below). In addition, we provide three ready-made files for this dataset that can be downloaded and used directly: a word dictionary, a label dictionary, and pretrained word embeddings.
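The conversion itself is only a few lines; here is a minimal standalone sketch that mirrors the logic in `conll03.py` (the sample label sequence is illustrative):

```python
# Convert I-TYPE labels to BIO labels, mirroring the logic in conll03.py.
def itype_to_bio(labels):
    bio = []
    for label in labels:
        # A chunk starts when the tag is not 'O' and either the sequence
        # begins here or the previous converted tag has a different type.
        if label != 'O' and (len(bio) == 0 or bio[-1][1:] != label[1:]):
            bio.append('B' + label[1:])
        else:
            bio.append(label)
    return bio

print(itype_to_bio(['I-ORG', 'O', 'I-PER', 'O', 'O', 'I-LOC', 'O']))
# ['B-ORG', 'O', 'B-PER', 'O', 'O', 'B-LOC', 'O']
```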
## Model

The network used in this example is shown in Figure 1. As implemented in `ner.py`, it maps each word to an embedding, passes the result through fully connected layers and two stacked bidirectional recurrent layers, and decodes the label sequence with a final CRF layer. More background on network models for sequence labeling is available on [this page](https://github.com/PaddlePaddle/book/tree/develop/07.label_semantic_roles).
<div align="center">
<img src="image/ner_network.png" width = "60%" align=center /><br>
Figure 1. Network architecture of the NER model
</div>
## Usage

After obtaining the dataset and the resource files mentioned above, update the data settings in `ner.py` as follows:
```python
# init dataset
train_data_file = 'data/train'
test_data_file = 'data/test'
vocab_file = 'data/vocab.txt'
target_file = 'data/target.txt'
emb_file = 'data/wordVectors.txt'
```
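With the paths set, training and inference are driven by the `__main__` block of `ner.py`; a minimal sketch of that entry point (one pass for brevity, assuming the data files above are in place):

```python
import paddle.v2 as paddle

# Initialize PaddlePaddle in CPU mode with a single trainer thread.
paddle.init(use_gpu=False, trainer_count=1)

# Train for one pass over the training data, then decode a few test samples.
parameters = ner_net_train(train_data_reader, num_passes=1)
ner_net_infer(test_data_reader, parameters)
```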
TBD
--- a/conll03.py
+++ b/conll03.py
@@ -81,23 +81,24 @@ def corpus_reader(filename='data/train'):
             else:
                 segs = line.strip().split()
                 sentence.append(segs[0])
-                labels.append(segs[-1])
+                # transform from I-TYPE to BIO schema
+                if segs[-1] != 'O' and (len(labels) == 0 or
+                                        labels[-1][1:] != segs[-1][1:]):
+                    labels.append('B' + segs[-1][1:])
+                else:
+                    labels.append(segs[-1])
         f.close()

     return reader


-def reader_creator(corpus_reader=corpus_reader('data/train'),
-                   word_dict=load_dict('data/vocab.txt'),
-                   label_dict=load_dict('data/target.txt')):
+def reader_creator(corpus_reader, word_dict, label_dict):
     """
     Conll03 train set creator.

-    Because the training dataset is not free, the test dataset is used for
-    training. It returns a reader creator, each sample in the reader is nine
-    features, including sentence sequence, predicate, predicate context,
-    predicate context flag and tagged sequence.
+    The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
+    It returns a reader creator, each sample in the reader includes sentence sequence and tagged sequence.

     :return: Training reader creator
     :rtype: callable
@@ -105,7 +106,6 @@ def reader_creator(corpus_reader=corpus_reader('data/train'),
     def reader():
         for sentence, labels in corpus_reader():
-            #word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
             word_idx = [
                 word_dict.get(canonicalize_word(w, word_dict), UNK_IDX)
                 for w in sentence
@@ -116,15 +116,19 @@ def reader_creator(corpus_reader=corpus_reader('data/train'),
     return reader


-def train():
+def train(data_file='data/train',
+          vocab_file='data/vocab.txt',
+          target_file='data/target.txt'):
     return reader_creator(
-        corpus_reader('data/train'),
-        word_dict=load_dict('data/vocab.txt'),
-        label_dict=load_dict('data/target.txt'))
+        corpus_reader(data_file),
+        word_dict=load_dict(vocab_file),
+        label_dict=load_dict(target_file))


-def test():
+def test(data_file='data/test',
+         vocab_file='data/vocab.txt',
+         target_file='data/target.txt'):
     return reader_creator(
-        corpus_reader('data/test'),
-        word_dict=load_dict('data/vocab.txt'),
-        label_dict=load_dict('data/target.txt'))
+        corpus_reader(data_file),
+        word_dict=load_dict(vocab_file),
+        label_dict=load_dict(target_file))
--- a/ner.py
+++ b/ner.py
@@ -5,13 +5,22 @@ import paddle.v2.evaluator as evaluator
 import conll03
 import itertools

-word_dict, label_dict = conll03.get_dict()
+# init dataset
+train_data_file = 'data/train'
+test_data_file = 'data/test'
+vocab_file = 'data/vocab.txt'
+target_file = 'data/target.txt'
+emb_file = 'data/wordVectors.txt'
+
+word_dict, label_dict = conll03.get_dict(vocab_file, target_file)
+word_vector_values = conll03.get_embedding(emb_file)
+
+train_data_reader = conll03.train(train_data_file, vocab_file, target_file)
+test_data_reader = conll03.test(test_data_file, vocab_file, target_file)

 # init hyper-params
 word_dict_len = len(word_dict)
 label_dict_len = len(label_dict)
 word_dim = 50
 caps_dim = 5
 context_length = 5
 hidden_dim = 300
 mix_hidden_lr = 1e-3
@@ -26,17 +35,14 @@ def d_type(size):
     return paddle.data_type.integer_value_sequence(size)


-def ner_net():
+def ner_net(is_train):
     word = paddle.layer.data(name='word', type=d_type(word_dict_len))
-    #ws = paddle.layer.data(name='ws', type=d_type(num_ws))

     word_embedding = paddle.layer.mixed(
         name='word_embedding',
         size=word_dim,
         input=paddle.layer.table_projection(input=word, param_attr=emb_para))
-    #ws_embedding = paddle.layer.mixed(name='ws_embedding', size=caps_dim,
-    #    input=paddle.layer.table_projection(input=ws))
-    emb_layers = [word_embedding]  #[word_embedding, ws_embedding]
+    emb_layers = [word_embedding]

     word_caps_vector = paddle.layer.concat(
         name='word_caps_vector', input=emb_layers)
@@ -49,27 +55,23 @@ def ner_net():
             input=word_caps_vector, param_attr=std_default)
     ])

-    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
+    rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
     hidden_para_attr = paddle.attr.Param(
         initial_std=default_std, learning_rate=mix_hidden_lr)

-    lstm_1_1 = paddle.layer.lstmemory(
+    rnn_1_1 = paddle.layer.recurrent(
         name='rnn1-1',
         input=hidden_1,
         act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
         bias_attr=std_0,
-        param_attr=lstm_para_attr)
-    lstm_1_2 = paddle.layer.lstmemory(
+        param_attr=rnn_para_attr)
+    rnn_1_2 = paddle.layer.recurrent(
         name='rnn1-2',
         input=hidden_1,
         act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
         reverse=1,
         bias_attr=std_0,
-        param_attr=lstm_para_attr)
+        param_attr=rnn_para_attr)

     hidden_2_1 = paddle.layer.mixed(
         size=hidden_dim,
@@ -78,7 +80,7 @@ def ner_net():
             paddle.layer.full_matrix_projection(
                 input=hidden_1, param_attr=hidden_para_attr),
             paddle.layer.full_matrix_projection(
-                input=lstm_1_1, param_attr=lstm_para_attr)
+                input=rnn_1_1, param_attr=rnn_para_attr)
         ])
     hidden_2_2 = paddle.layer.mixed(
         size=hidden_dim,
@@ -87,26 +89,22 @@ def ner_net():
             paddle.layer.full_matrix_projection(
                 input=hidden_1, param_attr=hidden_para_attr),
             paddle.layer.full_matrix_projection(
-                input=lstm_1_2, param_attr=lstm_para_attr)
+                input=rnn_1_2, param_attr=rnn_para_attr)
         ])

-    lstm_2_1 = paddle.layer.lstmemory(
+    rnn_2_1 = paddle.layer.recurrent(
         name='rnn2-1',
         input=hidden_2_1,
         act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
         reverse=1,
         bias_attr=std_0,
-        param_attr=lstm_para_attr)
-    lstm_2_2 = paddle.layer.lstmemory(
+        param_attr=rnn_para_attr)
+    rnn_2_2 = paddle.layer.recurrent(
         name='rnn2-2',
         input=hidden_2_2,
         act=paddle.activation.Relu(),
-        gate_act=paddle.activation.Sigmoid(),
-        state_act=paddle.activation.Sigmoid(),
         bias_attr=std_0,
-        param_attr=lstm_para_attr)
+        param_attr=rnn_para_attr)

     hidden_3 = paddle.layer.mixed(
         name='hidden3',
@@ -116,11 +114,11 @@ def ner_net():
             paddle.layer.full_matrix_projection(
                 input=hidden_2_1, param_attr=hidden_para_attr),
             paddle.layer.full_matrix_projection(
-                input=lstm_2_1,
-                param_attr=lstm_para_attr), paddle.layer.full_matrix_projection(
+                input=rnn_2_1,
+                param_attr=rnn_para_attr), paddle.layer.full_matrix_projection(
                 input=hidden_2_2, param_attr=hidden_para_attr),
             paddle.layer.full_matrix_projection(
-                input=lstm_2_2, param_attr=lstm_para_attr)
+                input=rnn_2_2, param_attr=rnn_para_attr)
         ])

     output = paddle.layer.mixed(
@@ -132,36 +130,42 @@ def ner_net():
             input=hidden_3, param_attr=std_default)
     ])

-    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
-
-    crf_cost = paddle.layer.crf(
-        size=label_dict_len,
-        input=output,
-        label=target,
-        param_attr=paddle.attr.Param(
-            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
-
-    predict = paddle.layer.crf_decoding(
-        size=label_dict_len,
-        input=output,
-        param_attr=paddle.attr.Param(name='crfw'))
-
-    return output, target, crf_cost, predict
+    if is_train:
+        target = paddle.layer.data(name='target', type=d_type(label_dict_len))
+        crf_cost = paddle.layer.crf(
+            size=label_dict_len,
+            input=output,
+            label=target,
+            param_attr=paddle.attr.Param(
+                name='crfw',
+                initial_std=default_std,
+                learning_rate=mix_hidden_lr))
+        crf_dec = paddle.layer.crf_decoding(
+            size=label_dict_len,
+            input=output,
+            label=target,
+            param_attr=paddle.attr.Param(name='crfw'))
+        return crf_cost, crf_dec, target
+    else:
+        predict = paddle.layer.crf_decoding(
+            size=label_dict_len,
+            input=output,
+            param_attr=paddle.attr.Param(name='crfw'))
+        return predict


-def ner_net_train(data_reader=conll03.train(), num_passes=1):
+def ner_net_train(data_reader, num_passes=1):
     # define network topology
-    feature_out, target, crf_cost, predict = ner_net()
-    crf_dec = paddle.layer.crf_decoding(
-        size=label_dict_len,
-        input=feature_out,
-        label=target,
-        param_attr=paddle.attr.Param(name='crfw'))
-    evaluator.sum(input=crf_dec)
+    crf_cost, crf_dec, target = ner_net(is_train=True)
+    evaluator.sum(name='error', input=crf_dec)

     # create parameters
     parameters = paddle.parameters.create(crf_cost)
-    parameters.set('emb', conll03.get_embedding())
+    parameters.set('emb', word_vector_values)

     # create optimizer
     optimizer = paddle.optimizer.Momentum(
@@ -179,7 +183,7 @@ def ner_net_train(data_reader=conll03.train(), num_passes=1):
         extra_layers=crf_dec)

     reader = paddle.batch(
-        paddle.reader.shuffle(data_reader, buf_size=8192), batch_size=256)
+        paddle.reader.shuffle(data_reader, buf_size=8192), batch_size=64)

     feeding = {'word': 0, 'target': 1}
@@ -210,9 +214,7 @@ def ner_net_train(data_reader=conll03.train(), num_passes=1):
     return parameters


-def ner_net_infer(parameters=paddle.parameters.Parameters.from_tar(
-        gzip.open('ner_params_pass_99.tar.gz')),
-                  data_reader=conll03.test()):
+def ner_net_infer(data_reader, parameters):
     test_creator = data_reader
     test_data = []
     for item in test_creator():
@@ -220,7 +222,7 @@ def ner_net_infer(parameters=paddle.parameters.Parameters.from_tar(
         if len(test_data) == 10:
             break

-    feature_out, target, crf_cost, predict = ner_net()
+    predict = ner_net(is_train=False)

     lab_ids = paddle.infer(
         output_layer=predict,
@@ -237,5 +239,5 @@ def ner_net_infer(parameters=paddle.parameters.Parameters.from_tar(
 if __name__ == '__main__':
     paddle.init(use_gpu=False, trainer_count=1)
-    ner_net_train()
-    ner_net_infer()
+    parameters = ner_net_train(train_data_reader, 1)
+    ner_net_infer(test_data_reader, parameters)