提交 132a26af 编写于 作者: C caoying03

refine NER.

上级 f27154e7
wget http://cs224d.stanford.edu/assignment2/assignment2.zip
unzip assignment2.zip
cp assignment2_release/data/ner/wordVectors.txt data/
cp assignment2_release/data/ner/vocab.txt data/
cp assignment2_release/data/ner/wordVectors.txt ./
cp assignment2_release/data/ner/vocab.txt ./
rm -rf assignment2.zip assignment2_release
-DOCSTART- -X- O O
CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
......
-DOCSTART- -X- O O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
......
因为 它太大了无法显示 source diff 。你可以改为 查看blob
import gzip
import reader
from network_conf import *
from utils import *
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
probs = inferer.infer(input=test_data, field=["id"])
assert len(probs) == sum(len(x[0]) for x in test_data)
for idx, test_sample in enumerate(test_data):
start_id = 0
pred_str = ""
for w, tag in zip(test_sample[0],
probs[start_id:start_id + len(test_sample[0])]):
pred_str += "%s[%s] " % (id_2_word[w], id_2_label[tag])
print(pred_str.strip())
start_id += len(test_sample[0])
word_dict = load_dict(vocab_file)
word_dict_len = len(word_dict)
word_reverse_dict = load_reverse_dict(vocab_file)
label_dict = load_dict(target_file)
label_reverse_dict = load_reverse_dict(target_file)
label_dict_len = len(label_dict)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
predict = ner_net(
word_dict_len=word_dict_len,
label_dict_len=label_dict_len,
is_train=False)
inferer = paddle.inference.Inference(
output_layer=predict, parameters=parameters)
test_data = []
for i, item in enumerate(
reader.data_reader(test_data_file, word_dict, label_dict)()):
test_data.append([item[0], item[1]])
if len(test_data) == batch_size:
_infer_a_batch(inferer, test_data, word_reverse_dict,
label_reverse_dict)
test_data = []
_infer_a_batch(inferer, test_data, word_reverse_dict, label_reverse_dict)
test_data = []
if __name__ == "__main__":
infer(
model_path="models/params_pass_0.tar.gz",
batch_size=2,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
import math
import gzip
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
import conll03
import itertools
# init dataset
train_data_file = 'data/train'
test_data_file = 'data/test'
vocab_file = 'data/vocab.txt'
target_file = 'data/target.txt'
emb_file = 'data/wordVectors.txt'
train_data_reader = conll03.train(train_data_file, vocab_file, target_file)
test_data_reader = conll03.test(test_data_file, vocab_file, target_file)
word_dict, label_dict = conll03.get_dict(vocab_file, target_file)
word_vector_values = conll03.get_embedding(emb_file)
# init hyper-params
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 300
mix_hidden_lr = 1e-3
default_std = 1 / math.sqrt(hidden_dim) / 3.0
emb_para = paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True)
std_0 = paddle.attr.Param(initial_std=0.)
std_default = paddle.attr.Param(initial_std=default_std)
def d_type(size):
return paddle.data_type.integer_value_sequence(size)
def ner_net(is_train):
word = paddle.layer.data(name='word', type=d_type(word_dict_len))
mark = paddle.layer.data(name='mark', type=d_type(mark_dict_len))
word_embedding = paddle.layer.mixed(
name='word_embedding',
size=word_dim,
input=paddle.layer.table_projection(input=word, param_attr=emb_para))
mark_embedding = paddle.layer.mixed(
name='mark_embedding',
size=mark_dim,
input=paddle.layer.table_projection(input=mark, param_attr=std_0))
emb_layers = [word_embedding, mark_embedding]
word_caps_vector = paddle.layer.concat(
name='word_caps_vector', input=emb_layers)
hidden_1 = paddle.layer.mixed(
name='hidden1',
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=word_caps_vector, param_attr=std_default)
])
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=default_std, learning_rate=mix_hidden_lr)
rnn_1_1 = paddle.layer.recurrent(
name='rnn1-1',
input=hidden_1,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_1_2 = paddle.layer.recurrent(
name='rnn1-2',
input=hidden_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_2_1 = paddle.layer.mixed(
name='hidden2-1',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_1, param_attr=rnn_para_attr)
])
hidden_2_2 = paddle.layer.mixed(
name='hidden2-2',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_1_2, param_attr=rnn_para_attr)
])
rnn_2_1 = paddle.layer.recurrent(
name='rnn2-1',
input=hidden_2_1,
act=paddle.activation.Relu(),
reverse=1,
bias_attr=std_0,
param_attr=rnn_para_attr)
rnn_2_2 = paddle.layer.recurrent(
name='rnn2-2',
input=hidden_2_2,
act=paddle.activation.Relu(),
bias_attr=std_0,
param_attr=rnn_para_attr)
hidden_3 = paddle.layer.mixed(
name='hidden3',
size=hidden_dim,
bias_attr=std_default,
act=paddle.activation.STanh(),
input=[
paddle.layer.full_matrix_projection(
input=hidden_2_1, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_1,
param_attr=rnn_para_attr), paddle.layer.full_matrix_projection(
input=hidden_2_2, param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=rnn_2_2, param_attr=rnn_para_attr)
])
output = paddle.layer.mixed(
name='output',
size=label_dict_len,
bias_attr=False,
input=[
paddle.layer.full_matrix_projection(
input=hidden_3, param_attr=std_default)
])
if is_train:
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
crf_cost = paddle.layer.crf(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(
name='crfw',
initial_std=default_std,
learning_rate=mix_hidden_lr))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf_cost, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=output,
param_attr=paddle.attr.Param(name='crfw'))
return predict
def ner_net_train(data_reader=train_data_reader, num_passes=1):
# define network topology
crf_cost, crf_dec, target = ner_net(is_train=True)
evaluator.sum(name='error', input=crf_dec)
evaluator.chunk(
name='ner_chunk',
input=crf_dec,
label=target,
chunk_scheme='IOB',
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set('emb', word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
reader = paddle.batch(
paddle.reader.shuffle(data_reader, buf_size=8192), batch_size=64)
feeding = {'word': 0, 'mark': 1, 'target': 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if event.batch_id % 1000 == 0:
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, Batch %d, %s" % (
event.pass_id, event.batch_id, result.metrics)
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
result = trainer.test(reader=reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
trainer.train(
reader=reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
return parameters
def ner_net_infer(data_reader=test_data_reader, model_file='ner_model.tar.gz'):
test_data = []
test_sentences = []
for item in data_reader():
test_data.append([item[0], item[1]])
test_sentences.append(item[-1])
if len(test_data) == 10:
break
predict = ner_net(is_train=False)
lab_ids = paddle.infer(
output_layer=predict,
parameters=paddle.parameters.Parameters.from_tar(gzip.open(model_file)),
input=test_data,
field='id')
flat_data = [word for word in itertools.chain.from_iterable(test_sentences)]
labels_reverse = {}
for (k, v) in label_dict.items():
labels_reverse[v] = k
pre_lab = [labels_reverse[lab_id] for lab_id in lab_ids]
for word, label in zip(flat_data, pre_lab):
print word, label
if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=1)
ner_net_train(data_reader=train_data_reader, num_passes=1)
ner_net_infer(
data_reader=test_data_reader, model_file='params_pass_0.tar.gz')
import math
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
def stacked_rnn(input_layer,
hidden_size,
hidden_para_attr,
rnn_para_attr,
stack_num=3,
reverse=False):
for i in range(stack_num):
hidden = paddle.layer.fc(
size=hidden_size,
act=paddle.activation.Tanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=[input_layer] if not i else [hidden, rnn],
param_attr=[rnn_para_attr]
if not i else [hidden_para_attr, rnn_para_attr])
rnn = paddle.layer.recurrent(
input=hidden,
act=paddle.activation.Relu(),
bias_attr=paddle.attr.Param(initial_std=1.),
reverse=reverse,
param_attr=rnn_para_attr)
return hidden, rnn
def ner_net(word_dict_len, label_dict_len, stack_num=3, is_train=True):
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 128
word = paddle.layer.data(
name='word',
type=paddle.data_type.integer_value_sequence(word_dict_len))
word_embedding = paddle.layer.embedding(
input=word,
size=word_dim,
param_attr=paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True))
mark = paddle.layer.data(
name='mark',
type=paddle.data_type.integer_value_sequence(mark_dict_len))
mark_embedding = paddle.layer.embedding(
input=mark,
size=mark_dim,
param_attr=paddle.attr.Param(initial_std=math.sqrt(1. / word_dim)))
emb_layers = [word_embedding, mark_embedding]
word_caps_vector = paddle.layer.concat(input=emb_layers)
mix_hidden_lr = 1e-3
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=1 / math.sqrt(hidden_dim), learning_rate=mix_hidden_lr)
forward_hidden, rnn_forward = stacked_rnn(word_caps_vector, hidden_dim,
hidden_para_attr, rnn_para_attr)
backward_hidden, rnn_backward = stacked_rnn(
word_caps_vector,
hidden_dim,
hidden_para_attr,
rnn_para_attr,
reverse=True)
fea = paddle.layer.fc(
size=hidden_dim,
bias_attr=paddle.attr.Param(initial_std=1.),
act=paddle.activation.STanh(),
input=[forward_hidden, rnn_forward, backward_hidden, rnn_backward],
param_attr=[
hidden_para_attr, rnn_para_attr, hidden_para_attr, rnn_para_attr
])
emission = paddle.layer.fc(
size=label_dict_len,
bias_attr=False,
input=fea,
param_attr=rnn_para_attr)
if is_train:
target = paddle.layer.data(
name='target',
type=paddle.data_type.integer_value_sequence(label_dict_len))
crf = paddle.layer.crf(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw', initial_std=1e-3))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
param_attr=paddle.attr.Param(name='crfw'))
return predict
......@@ -2,16 +2,9 @@
Conll03 dataset.
"""
import tarfile
import gzip
import itertools
import collections
import re
import numpy as np
from utils import *
__all__ = ['train', 'test', 'get_dict', 'get_embedding']
UNK_IDX = 0
__all__ = ["data_reader"]
def canonicalize_digits(word):
......@@ -28,96 +21,48 @@ def canonicalize_word(word, wordset=None, digits=True):
if (wordset != None) and (word in wordset): return word
word = canonicalize_digits(word) # try to canonicalize numbers
if (wordset == None) or (word in wordset): return word
else: return "UUUNKKK" # unknown token
else: return "<UNK>" # unknown token
def load_dict(filename):
d = dict()
with open(filename, 'r') as f:
for i, line in enumerate(f):
d[line.strip()] = i
return d
def get_dict(vocab_file='data/vocab.txt', target_file='data/target.txt'):
"""
Get the word and label dictionary.
def data_reader(data_file, word_dict, label_dict):
"""
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
return word_dict, label_dict
Conll03 train set creator.
The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
It returns a reader creator, each sample in the reader includes:
word id sequence, label id sequence and raw sentence.
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
:return: reader creator
:rtype: callable
"""
return np.loadtxt(emb_file, dtype=float)
def corpus_reader(filename='data/train'):
def reader():
UNK_IDX = word_dict["<UNK>"]
sentence = []
labels = []
with open(filename) as f:
with open(data_file, "r") as f:
for line in f:
if re.match(r"-DOCSTART-.+", line) or (len(line.strip()) == 0):
if len(line.strip()) == 0:
if len(sentence) > 0:
yield sentence, labels
word_idx = [
word_dict.get(
canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict[l] for l in labels]
yield word_idx, mark, label_idx
sentence = []
labels = []
else:
segs = line.strip().split()
sentence.append(segs[0])
# transform from I-TYPE to BIO schema
if segs[-1] != 'O' and (len(labels) == 0 or
# transform I-TYPE to BIO schema
if segs[-1] != "O" and (len(labels) == 0 or
labels[-1][1:] != segs[-1][1:]):
labels.append('B' + segs[-1][1:])
labels.append("B" + segs[-1][1:])
else:
labels.append(segs[-1])
f.close()
return reader
def reader_creator(corpus_reader, word_dict, label_dict):
"""
Conll03 train set creator.
The dataset can be obtained according to http://www.clips.uantwerpen.be/conll2003/ner/.
It returns a reader creator, each sample in the reader includes word id sequence, label id sequence and raw sentence for purpose of print.
:return: Training reader creator
:rtype: callable
"""
def reader():
for sentence, labels in corpus_reader():
word_idx = [
word_dict.get(canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict.get(w) for w in labels]
yield word_idx, mark, label_idx, sentence
return reader
def train(data_file='data/train',
vocab_file='data/vocab.txt',
target_file='data/target.txt'):
return reader_creator(
corpus_reader(data_file),
word_dict=load_dict(vocab_file),
label_dict=load_dict(target_file))
def test(data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt'):
return reader_creator(
corpus_reader(data_file),
word_dict=load_dict(vocab_file),
label_dict=load_dict(target_file))
import gzip
import numpy as np
import reader
from utils import *
from network_conf import *
def main(train_data_file,
test_data_file,
vocab_file,
target_file,
emb_file,
num_passes=10,
batch_size=32):
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
word_vector_values = get_embedding(emb_file)
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
evaluator.sum(name="error", input=crf_dec)
evaluator.chunk(
name="ner_chunk",
input=crf_dec,
label=target,
chunk_scheme="IOB",
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set("emb", word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
feeding = {"word": 0, "mark": 1, "target": 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if event.batch_id % 1 == 0:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, Batch %d, %s" %
(event.pass_id, event.batch_id, result.metrics))
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open("models/params_pass_%d.tar.gz" % event.pass_id,
"w") as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, %s" % (event.pass_id,
result.metrics))
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
if __name__ == "__main__":
main(
train_data_file='data/train',
test_data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt',
emb_file='data/wordVectors.txt')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import argparse
import numpy as np
from collections import defaultdict
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype=float)
def load_dict(dict_path):
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册