提交 8ba8237a 编写于 作者: S sweetsky0901

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my_unpool_max_2d

......@@ -114,18 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(sigmoid)
.InEnum({identity, sigmoid, tanh, relu});
AddComment(R"DOC(
GRUUnit Operator.
This operator implements partial calculations of the GRU unit as follows:
GRUUnit Operator implements partial calculations of the GRU unit as following:
$$
update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r) \\
output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
$$
The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
which is same as one time step of GRU Operator.
@note To implement the complete GRU unit, fully-connected operator must be
used before to feed xu, xr and xc as the Input of GRUUnit operator.
)DOC");
}
......@@ -150,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
"ResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
"Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"Gate");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"ResetHiddenPrev");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
"Hidden");
......
......@@ -110,7 +110,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
auto c = g.slice(c_offsets, extents); // output candidate
// calculate final output
h.device(place) = u * (h_p - c) + c;
h.device(place) = u * (c - h_p) + h_p;
}
};
......@@ -146,35 +146,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
auto* weight_grad =
context.Output<Tensor>(framework::GradVarName("Weight"));
auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
input_grad->mutable_data<T>(context.GetPlace());
hidden_prev_grad->mutable_data<T>(context.GetPlace());
weight_grad->mutable_data<T>(context.GetPlace());
Tensor gate_grad;
gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
Tensor reset_hidden_prev_grad;
reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
context.GetPlace());
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
const T* hidden_prev_data = hidden_prev->data<T>();
T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
const T* weight_data = weight->data<T>();
T* weight_grad_data = weight_grad->data<T>();
T* gate_grad_data = gate_grad.data<T>();
T* gate_grad_data =
gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
reset_hidden_prev->dims(), context.GetPlace());
auto h_p = EigenMatrix<T>::From(*hidden_prev);
auto g = EigenMatrix<T>::From(*gate);
auto d_h = EigenMatrix<T>::From(*hidden_grad);
auto d_x = EigenMatrix<T>::From(*input_grad);
auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
auto d_g = EigenMatrix<T>::From(gate_grad);
auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
auto place = context.GetEigenDevice<Place>();
int batch_size = input->dims()[0];
int frame_size = hidden_prev->dims()[1];
Eigen::array<int, 2> extents({{batch_size, frame_size}});
Eigen::array<int, 2> u_offsets({{0, 0}});
auto u = g.slice(u_offsets, extents); // update gate
......@@ -185,38 +177,52 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
// backward for unactivated update gate
ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
d_g.slice(u_offsets, extents), d_h * (h_p - c));
d_g.slice(u_offsets, extents), d_h * (c - h_p));
// backward for unactivated output candidate
ActGradCompute(context.Attr<int>("activation"), place, c, c,
d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
d_g.slice(c_offsets, extents), d_h * u);
// backward for reset_hidden_prev
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
frame_size, frame_size, 1,
gate_grad_data + frame_size * 2, frame_size * 3,
weight_data + frame_size * frame_size * 2, frame_size,
0, reset_hidden_prev_grad_data, frame_size);
// backward for unactivated reset gate
ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
d_g.slice(r_offsets, extents), d_r_h_p * h_p);
// backward for weight
if (weight_grad) {
T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
// backward for state_weight
math::gemm<Place, T>(
context.device_context(), true, false, frame_size, frame_size,
batch_size, 1, reset_hidden_prev_data, frame_size,
gate_grad_data + frame_size * 2, frame_size * 3, 0,
weight_grad_data + frame_size * frame_size * 2, frame_size);
// backward for unactivated reset gate
ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
d_g.slice(r_offsets, extents), d_r_h_p * h_p);
// backward for update_gate_weight and reset_gate_weight
math::gemm<Place, T>(context.device_context(), true, false, frame_size,
frame_size * 2, batch_size, 1, hidden_prev_data,
frame_size, gate_grad_data, frame_size * 3, 0,
weight_grad_data, frame_size * 2);
}
// backward for hidden_prev
d_h_p.device(place) = d_r_h_p * r + d_h * u;
if (hidden_prev_grad) {
T* hidden_prev_grad_data =
hidden_prev_grad->mutable_data<T>(context.GetPlace());
auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
frame_size, frame_size * 2, 1, gate_grad_data,
frame_size * 3, weight_data, frame_size * 2, 1,
hidden_prev_grad_data, frame_size);
}
// backward for input
if (input_grad) {
input_grad->mutable_data<T>(context.GetPlace());
auto d_x = EigenMatrix<T>::From(*input_grad);
d_x.device(place) = d_g;
}
// backward for bias
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
......
......@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
ll -= std::log(sum);
// Now ll is equal to -log(Z).
const int* lbl = label.data<int>();
const int64_t* lbl = label.data<int64_t>();
PADDLE_ENFORCE_LT(
static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
"An invalid tag label that execesses the largest tag number.");
......@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
Tensor* emission_grad) const {
const T* w_exps = transition_exps.data<T>();
const T* x_exps = emission_exps.data<T>();
const int* label_value = label.data<int>();
const int64_t* label_value = label.data<int64_t>();
T* beta_value = beta->data<T>();
auto x_dims = emission_exps.dims();
......
......@@ -126,7 +126,10 @@ class LayerHelper(object):
self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr_copy)
return self.main_program.global_block().create_parameter(
name=attr_copy['name'], dtype=dtype, shape=shape)
name=attr_copy['name'],
dtype=dtype,
shape=shape,
trainable=attr_copy.get('trainable', True))
def create_tmp_variable(self, dtype):
return self.main_program.current_block().create_var(
......
......@@ -112,6 +112,7 @@ def fc(input,
def embedding(input,
size,
is_sparse=False,
param_initializer=None,
param_attr=None,
data_type='float32',
main_program=None,
......@@ -136,9 +137,16 @@ def embedding(input,
to the LayerHelper constructor.
"""
def _get_default_param_initializer():
return XavierInitializer()
helper = LayerHelper('embedding', **locals())
w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=data_type)
attr=helper.param_attr,
shape=size,
dtype=data_type,
initializer=param_initializer or _get_default_param_initializer())
tmp = helper.create_tmp_variable(data_type)
helper.append_op(
type='lookup_table',
......@@ -460,6 +468,41 @@ def sums(input, main_program=None, startup_program=None):
return out
def linear_chain_crf(input,
label,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
def _get_default_param_initializer():
return XavierInitializer()
helper = LayerHelper('linear_chain_crf', **locals())
size = input.shape[1]
transition = helper.create_parameter(
attr=helper.param_attr,
shape=[size + 2, size],
dtype=helper.input_dtype(),
initializer=param_initializer or _get_default_param_initializer())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='linear_chain_crf',
inputs={"Emission": [input],
"Transition": transition,
"Label": label},
outputs={
"Alpha": [alpha],
"EmissionExps": [emission_exps],
"TransitionExps": transition_exps,
"LogLikelihood": log_likelihood
})
return log_likelihood
def assign(input, output, main_program=None, startup_program=None):
helper = LayerHelper('assign', **locals())
helper.append_op(
......
......@@ -170,7 +170,8 @@ class Optimizer(object):
optimize_ops = []
for param_and_grad in parameters_and_grads:
if param_and_grad[1] is not None:
if param_and_grad[0].trainable is True and param_and_grad[
1] is not None:
optimize_op = self._append_optimize_op(loss.block,
param_and_grad)
optimize_ops.append(optimize_op)
......
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor, g_scope
from paddle.v2.fluid.optimizer import SGDOptimizer
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
IS_SPARSE = True
PASS_NUM = 10
BATCH_SIZE = 20
embedding_name = 'emb'
def load_parameter(file_name, h, w):
with open(file_name, 'rb') as f:
f.read(16) # skip header.
return np.fromfile(f, dtype=np.float32).reshape(h, w)
def db_lstm():
# 8 features
word = layers.data(name='word_data', shape=[1], data_type='int64')
predicate = layers.data(name='verb_data', shape=[1], data_type='int64')
ctx_n2 = layers.data(name='ctx_n2_data', shape=[1], data_type='int64')
ctx_n1 = layers.data(name='ctx_n1_data', shape=[1], data_type='int64')
ctx_0 = layers.data(name='ctx_0_data', shape=[1], data_type='int64')
ctx_p1 = layers.data(name='ctx_p1_data', shape=[1], data_type='int64')
ctx_p2 = layers.data(name='ctx_p2_data', shape=[1], data_type='int64')
mark = layers.data(name='mark_data', shape=[1], data_type='int64')
predicate_embedding = layers.embedding(
input=predicate,
size=[pred_len, word_dim],
data_type='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'vemb'})
mark_embedding = layers.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
data_type='float32',
is_sparse=IS_SPARSE)
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
layers.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr={'name': embedding_name,
'trainable': False}) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0_layers = [
layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
]
hidden_0 = layers.sums(input=hidden_0_layers)
lstm_0 = layers.dynamic_lstm(
input=hidden_0,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid')
# stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = layers.sums(input=[
layers.fc(input=input_tmp[0], size=hidden_dim),
layers.fc(input=input_tmp[1], size=hidden_dim)
])
lstm = layers.dynamic_lstm(
input=mix_hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
is_reverse=((i % 2) == 1))
input_tmp = [mix_hidden, lstm]
feature_out = layers.sums(input=[
layers.fc(input=input_tmp[0], size=label_dict_len),
layers.fc(input=input_tmp[1], size=label_dict_len)
])
return feature_out
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
# define network topology
feature_out = db_lstm()
target = layers.data(name='target', shape=[1], data_type='int64')
crf_cost = layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr={"name": 'crfw',
"learning_rate": mix_hidden_lr})
avg_cost = layers.mean(x=crf_cost)
# TODO(qiao)
# 1. add crf_decode_layer and evaluator
# 2. use other optimizer and check why out will be NAN
sgd_optimizer = SGDOptimizer(learning_rate=0.0001)
opts = sgd_optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE)
place = core.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
embedding_param = g_scope.find_var(embedding_name).get_tensor()
embedding_param.set(
load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
batch_id = 0
for pass_id in xrange(PASS_NUM):
for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
verb_data = to_lodtensor(map(lambda x: x[6], data), place)
mark_data = to_lodtensor(map(lambda x: x[7], data), place)
target = to_lodtensor(map(lambda x: x[8], data), place)
outs = exe.run(framework.default_main_program(),
feed={
'word_data': word_data,
'ctx_n2_data': ctx_n2_data,
'ctx_n1_data': ctx_n1_data,
'ctx_0_data': ctx_0_data,
'ctx_p1_data': ctx_p1_data,
'ctx_p2_data': ctx_p2_data,
'verb_data': verb_data,
'mark_data': mark_data,
'target': target
},
fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0])
if batch_id % 10 == 0:
print("avg_cost=" + str(avg_cost_val))
# exit early for CI
exit(0)
batch_id = batch_id + 1
if __name__ == '__main__':
main()
......@@ -28,8 +28,8 @@ def relu(x):
class TestGRUUnitOp(OpTest):
batch_size = 3
frame_size = 5
batch_size = 5
frame_size = 10
activate = {
GRUActivationType.identity: identity,
GRUActivationType.sigmoid: sigmoid,
......@@ -77,7 +77,7 @@ class TestGRUUnitOp(OpTest):
c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
g[:, frame_size * 2:])
g = np.hstack((u_r, c))
h = u * h_p + (1 - u) * c
h = u * c + (1 - u) * h_p
self.outputs = {
'Gate': g.astype('float64'),
'ResetHiddenPrev': r_h_p.astype('float64'),
......@@ -92,10 +92,7 @@ class TestGRUUnitOp(OpTest):
self.check_output()
def test_check_grad(self):
self.check_grad(
['Input', 'HiddenPrev', 'Weight'],
['Hidden', 'ResetHiddenPrev', 'Gate'],
max_relative_error=0.007)
self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
class TestGRUUnitOpWithBias(TestGRUUnitOp):
......@@ -104,18 +101,20 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
frame_size = self.frame_size
super(TestGRUUnitOpWithBias, self).set_inputs()
self.inputs['Bias'] = np.random.uniform(
-0.1, 0.1, (1, frame_size * 3)).astype('float32')
-0.1, 0.1, (1, frame_size * 3)).astype('float64')
self.attrs = {
'activation': GRUActivationType.identity,
'gate_activation': GRUActivationType.sigmoid
}
def test_check_grad(self):
self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
def test_check_grad_ingore_input(self):
self.check_grad(
['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
max_relative_error=0.007)
['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
no_grad_set=set('Input'))
if __name__ == '__main__':
exit(0) # FIXME(yuyang18): This unittest is not pass. Fix it later
unittest.main()
import unittest
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.framework import Program
import paddle.v2.fluid.core as core
import unittest
class TestBook(unittest.TestCase):
......@@ -20,7 +20,8 @@ class TestBook(unittest.TestCase):
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost)
program.append_backward(avg_cost)
print str(program)
# print str(program)
def test_recognize_digits_mlp(self):
program = Program()
......@@ -49,7 +50,7 @@ class TestBook(unittest.TestCase):
input=predict, label=label, main_program=program)
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost)
print str(program)
# print str(program)
def test_simple_conv2d(self):
program = Program()
......@@ -64,7 +65,7 @@ class TestBook(unittest.TestCase):
filter_size=[4, 4],
main_program=program)
print str(program)
# print str(program)
def test_recognize_digits_conv(self):
program = Program()
......@@ -103,7 +104,7 @@ class TestBook(unittest.TestCase):
program.append_backward(avg_cost)
print str(program)
# print str(program)
def test_word_embedding(self):
program = Program()
......@@ -164,7 +165,24 @@ class TestBook(unittest.TestCase):
avg_cost = layers.mean(x=cost, main_program=program)
self.assertIsNotNone(avg_cost)
print str(program)
# print str(program)
def test_linear_chain_crf(self):
program = Program()
# Change g_program, so the rest layers use `g_program`
images = layers.data(
name='pixel',
shape=[784],
data_type='float32',
main_program=program)
label = layers.data(
name='label', shape=[1], data_type='int32', main_program=program)
hidden = layers.fc(input=images, size=128, main_program=program)
crf = layers.linear_chain_crf(
input=hidden, label=label, main_program=program)
# print str(program)
if __name__ == '__main__':
......
......@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
transition_exps = np.exp(transition)
labels = np.random.randint(
low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
self.inputs = {
"Emission": (emission, lod),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册