Unverified commit 53bd51e3, authored by Qiao Longfei, committed by GitHub

07/Label semantic roles (#5798)

* init label_semantic_roles.py

* add linear_chain_crf and test

* complete test_linear_chain_crf

* correct last layer of db_lstm

* update optimizer and initializer

* update param_initializer of embedding_layer

* support loading pre-trained embeddings

* rm unused parameter

* optimize code

* clean code

* fix test

* add todo
Parent 778b981e
@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     ll -= std::log(sum);
     // Now ll is equal to -log(Z).
-    const int* lbl = label.data<int>();
+    const int64_t* lbl = label.data<int64_t>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
         "An invalid tag label that exceeds the largest tag number.");
@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                              Tensor* emission_grad) const {
     const T* w_exps = transition_exps.data<T>();
     const T* x_exps = emission_exps.data<T>();
-    const int* label_value = label.data<int>();
+    const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
     auto x_dims = emission_exps.dims();
...
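Since both kernels now read the label tensor as int64_t, label data fed from the Python side must use the matching dtype. A minimal numpy sketch (the tag count and sequence length here are illustrative, not from the commit):

import numpy as np

TAG_NUM = 10   # illustrative tag-set size
SEQ_LEN = 5    # illustrative sequence length

# Labels must be int64 to match label.data<int64_t>() in the kernel;
# int32 data would no longer line up with what the op reads.
labels = np.random.randint(
    low=0, high=TAG_NUM, size=(SEQ_LEN, 1), dtype="int64")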
@@ -126,7 +126,10 @@ class LayerHelper(object):
             self.startup_program.global_block().create_parameter(
                 dtype=dtype, shape=shape, **attr_copy)
         return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'], dtype=dtype, shape=shape)
+            name=attr_copy['name'],
+            dtype=dtype,
+            shape=shape,
+            trainable=attr_copy.get('trainable', True))

     def create_tmp_variable(self, dtype):
         return self.main_program.current_block().create_var(
...
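With `trainable` now threaded through create_parameter, a layer can freeze its weights via param_attr. A sketch of the intended usage, mirroring the word embeddings in the demo script further down (all names come from that script, not from this hunk):

# Frozen pre-trained embedding: the parameter is created with
# trainable=False, so the optimizer change below skips its update.
emb = layers.embedding(
    input=word,
    size=[word_dict_len, word_dim],
    param_attr={'name': 'emb', 'trainable': False})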
@@ -112,6 +112,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              param_initializer=None,
               param_attr=None,
               data_type='float32',
               main_program=None,
@@ -136,9 +137,16 @@ def embedding(input,
        to the LayerHelper constructor.
     """

+    def _get_default_param_initializer():
+        return XavierInitializer()
+
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=data_type)
+        attr=helper.param_attr,
+        shape=size,
+        dtype=data_type,
+        initializer=param_initializer or _get_default_param_initializer())
     tmp = helper.create_tmp_variable(data_type)
     helper.append_op(
         type='lookup_table',
@@ -460,6 +468,41 @@ def sums(input, main_program=None, startup_program=None):
     return out

+
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     param_initializer=None,
+                     main_program=None,
+                     startup_program=None):
+    def _get_default_param_initializer():
+        return XavierInitializer()
+
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype(),
+        initializer=param_initializer or _get_default_param_initializer())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
+
+
 def assign(input, output, main_program=None, startup_program=None):
     helper = LayerHelper('assign', **locals())
     helper.append_op(
...
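A hedged usage sketch of the new layer, matching the demo script below: `input` carries per-tag emission scores with `size` columns, the layer creates a [size + 2, size] transition parameter (the two extra rows presumably hold the start and end transition weights of the CRF), and the returned LogLikelihood is one value per sequence. The names feature_out, target, and mix_hidden_lr come from that script:

# feature_out: emission scores from the network; target: int64 tag ids.
crf_cost = layers.linear_chain_crf(
    input=feature_out,
    label=target,
    param_attr={'name': 'crfw', 'learning_rate': mix_hidden_lr})
avg_cost = layers.mean(x=crf_cost)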
@@ -170,7 +170,8 @@ class Optimizer(object):
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is not None:
+            if param_and_grad[0].trainable is True and param_and_grad[
+                    1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
...
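The new guard skips any parameter whose trainable flag is False, in addition to those without a gradient. A self-contained sketch of the same filtering logic (the Param class is a stand-in, not a Paddle type):

class Param(object):
    def __init__(self, name, trainable):
        self.name = name
        self.trainable = trainable

# (parameter, gradient) pairs: only trainable params that actually have
# a gradient would get an optimize op appended.
pairs = [(Param('crfw', True), 'g0'),
         (Param('emb', False), 'g1'),   # frozen embedding: skipped
         (Param('fc_b', True), None)]   # no gradient: skipped
updated = [p.name for p, g in pairs
           if p.trainable is True and g is not None]
assert updated == ['crfw']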
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor, g_scope
from paddle.v2.fluid.optimizer import SGDOptimizer

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
BATCH_SIZE = 20
embedding_name = 'emb'


def load_parameter(file_name, h, w):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header.
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
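The embedding file returned by conll05.get_embedding() is, judging from this helper, a raw float32 blob behind a 16-byte header that load_parameter skips. A usage sketch; the shape must match the 'emb' parameter created in db_lstm below:

# Returns a [word_dict_len, word_dim] numpy array; main() later copies
# it into the 'emb' parameter's tensor before training starts.
pretrained = load_parameter(conll05.get_embedding(), word_dict_len, word_dim)
assert pretrained.shape == (word_dict_len, word_dim)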
def db_lstm():
    # 8 features
    word = layers.data(name='word_data', shape=[1], data_type='int64')
    predicate = layers.data(name='verb_data', shape=[1], data_type='int64')
    ctx_n2 = layers.data(name='ctx_n2_data', shape=[1], data_type='int64')
    ctx_n1 = layers.data(name='ctx_n1_data', shape=[1], data_type='int64')
    ctx_0 = layers.data(name='ctx_0_data', shape=[1], data_type='int64')
    ctx_p1 = layers.data(name='ctx_p1_data', shape=[1], data_type='int64')
    ctx_p2 = layers.data(name='ctx_p2_data', shape=[1], data_type='int64')
    mark = layers.data(name='mark_data', shape=[1], data_type='int64')

    predicate_embedding = layers.embedding(
        input=predicate,
        size=[pred_len, word_dim],
        data_type='float32',
        is_sparse=IS_SPARSE,
        param_attr={'name': 'vemb'})

    mark_embedding = layers.embedding(
        input=mark,
        size=[mark_dict_len, mark_dim],
        data_type='float32',
        is_sparse=IS_SPARSE)

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr={'name': embedding_name,
                        'trainable': False}) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
    ]
    hidden_0 = layers.sums(input=hidden_0_layers)

    lstm_0 = layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = layers.sums(input=[
            layers.fc(input=input_tmp[0], size=hidden_dim),
            layers.fc(input=input_tmp[1], size=hidden_dim)
        ])

        lstm = layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = layers.sums(input=[
        layers.fc(input=input_tmp[0], size=label_dict_len),
        layers.fc(input=input_tmp[1], size=label_dict_len)
    ])

    return feature_out
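The loop above alternates LSTM direction per depth level: lstm_0 runs forward over the sequence, then each stacked layer at odd i runs in reverse, which is what builds the bidirectional topology. A one-line check of the pattern, using the depth = 8 configured above:

directions = ['reverse' if (i % 2) == 1 else 'forward'
              for i in range(1, depth)]
assert directions == ['reverse', 'forward'] * 3 + ['reverse']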
def to_lodtensor(data, place):
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = core.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
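to_lodtensor packs a batch of variable-length sequences into one flat [total_len, 1] int64 tensor plus a level-of-detail (LoD) offset list. A small self-contained sketch of the offset arithmetic (numpy only, no Paddle types needed):

import numpy as np

# Sequences of lengths 3, 2, and 4 flatten to 9 rows; the LoD records
# cumulative offsets, so rows [0:3], [3:5], [5:9] map back to sequences.
data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
lod = [0]
for seq in data:
    lod.append(lod[-1] + len(seq))
assert lod == [0, 3, 5, 9]
flattened = np.concatenate(data, axis=0).astype('int64').reshape(-1, 1)
assert flattened.shape == (9, 1)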
def main():
    # define network topology
    feature_out = db_lstm()
    target = layers.data(name='target', shape=[1], data_type='int64')
    crf_cost = layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr={"name": 'crfw',
                    "learning_rate": mix_hidden_lr})
    avg_cost = layers.mean(x=crf_cost)

    # TODO(qiao)
    # 1. add crf_decode_layer and evaluator
    # 2. use other optimizer and check why out will be NAN
    sgd_optimizer = SGDOptimizer(learning_rate=0.0001)
    opts = sgd_optimizer.minimize(avg_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
    place = core.CPUPlace()
    exe = Executor(place)

    exe.run(framework.default_startup_program())

    embedding_param = g_scope.find_var(embedding_name).get_tensor()
    embedding_param.set(
        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)

    batch_id = 0
    for pass_id in xrange(PASS_NUM):
        for data in train_data():
            word_data = to_lodtensor(map(lambda x: x[0], data), place)
            ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
            ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
            ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
            ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
            ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
            verb_data = to_lodtensor(map(lambda x: x[6], data), place)
            mark_data = to_lodtensor(map(lambda x: x[7], data), place)
            target = to_lodtensor(map(lambda x: x[8], data), place)

            outs = exe.run(framework.default_main_program(),
                           feed={
                               'word_data': word_data,
                               'ctx_n2_data': ctx_n2_data,
                               'ctx_n1_data': ctx_n1_data,
                               'ctx_0_data': ctx_0_data,
                               'ctx_p1_data': ctx_p1_data,
                               'ctx_p2_data': ctx_p2_data,
                               'verb_data': verb_data,
                               'mark_data': mark_data,
                               'target': target
                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])

            if batch_id % 10 == 0:
                print("avg_cost=" + str(avg_cost_val))

            # exit early for CI
            exit(0)

            batch_id = batch_id + 1


if __name__ == '__main__':
    main()
-import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
+import paddle.v2.fluid.core as core
+import unittest

 class TestBook(unittest.TestCase):
@@ -20,7 +20,8 @@ class TestBook(unittest.TestCase):
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
         program.append_backward(avg_cost)
-        print str(program)
+
+        # print str(program)

     def test_recognize_digits_mlp(self):
         program = Program()
@@ -49,7 +50,7 @@ class TestBook(unittest.TestCase):
             input=predict, label=label, main_program=program)
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
-        print str(program)
+        # print str(program)

     def test_simple_conv2d(self):
         program = Program()
@@ -64,7 +65,7 @@ class TestBook(unittest.TestCase):
             filter_size=[4, 4],
             main_program=program)
-        print str(program)
+        # print str(program)

     def test_recognize_digits_conv(self):
         program = Program()
@@ -103,7 +104,7 @@ class TestBook(unittest.TestCase):
         program.append_backward(avg_cost)
-        print str(program)
+        # print str(program)

     def test_word_embedding(self):
         program = Program()
@@ -164,7 +165,24 @@ class TestBook(unittest.TestCase):
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
-        print str(program)
+        # print str(program)
+
+    def test_linear_chain_crf(self):
+        program = Program()
+
+        # Change g_program, so the rest layers use `g_program`
+        images = layers.data(
+            name='pixel',
+            shape=[784],
+            data_type='float32',
+            main_program=program)
+        label = layers.data(
+            name='label', shape=[1], data_type='int32', main_program=program)
+        hidden = layers.fc(input=images, size=128, main_program=program)
+        crf = layers.linear_chain_crf(
+            input=hidden, label=label, main_program=program)
+
+        # print str(program)

 if __name__ == '__main__':
...
@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
         self.inputs = {
             "Emission": (emission, lod),
...