diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
index 89c027ff1eea93012dc5ab22b081786efc328e96..877c969103cfc17e1b170449d1922d9c7db2a58b 100644
--- a/paddle/operators/gru_unit_op.cc
+++ b/paddle/operators/gru_unit_op.cc
@@ -114,18 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(sigmoid)
         .InEnum({identity, sigmoid, tanh, relu});
     AddComment(R"DOC(
-GRUUnit Operator.
-
-This operator implements partial calculations of the GRU unit as follows:
+GRUUnit Operator implements partial calculations of the GRU unit as following:
 
 $$
-update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
-reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r)  \\
-output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
-output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
 $$
 
-The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
+which is same as one time step of GRU Operator.
+
+@note To implement the complete GRU unit, fully-connected operator must be 
+used before to feed xu, xr and xc as the Input of GRUUnit operator.
 
 )DOC");
   }
@@ -150,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
                    "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput("Hidden"),
                    "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "Gate");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
-                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
-                   "ResetHiddenPrev");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
                    "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
                    "Hidden");
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
index c53e7d9827e0395e6ce613302e732b2797f83cdd..050430d3252d05236219cd5ced5a792c21413c1f 100644
--- a/paddle/operators/gru_unit_op.h
+++ b/paddle/operators/gru_unit_op.h
@@ -110,7 +110,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     auto c = g.slice(c_offsets, extents);  // output candidate
 
     // calculate final output
-    h.device(place) = u * (h_p - c) + c;
+    h.device(place) = u * (c - h_p) + h_p;
   }
 };
 
@@ -146,35 +146,27 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     auto* weight_grad =
         context.Output<Tensor>(framework::GradVarName("Weight"));
     auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
-    input_grad->mutable_data<T>(context.GetPlace());
-    hidden_prev_grad->mutable_data<T>(context.GetPlace());
-    weight_grad->mutable_data<T>(context.GetPlace());
     Tensor gate_grad;
-    gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     Tensor reset_hidden_prev_grad;
-    reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
-                                           context.GetPlace());
-
-    int batch_size = input->dims()[0];
-    int frame_size = hidden_prev->dims()[1];
 
     const T* hidden_prev_data = hidden_prev->data<T>();
-    T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
     const T* weight_data = weight->data<T>();
-    T* weight_grad_data = weight_grad->data<T>();
-    T* gate_grad_data = gate_grad.data<T>();
+    T* gate_grad_data =
+        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
     const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
+        reset_hidden_prev->dims(), context.GetPlace());
 
     auto h_p = EigenMatrix<T>::From(*hidden_prev);
     auto g = EigenMatrix<T>::From(*gate);
     auto d_h = EigenMatrix<T>::From(*hidden_grad);
-    auto d_x = EigenMatrix<T>::From(*input_grad);
-    auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
     auto d_g = EigenMatrix<T>::From(gate_grad);
     auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
     auto place = context.GetEigenDevice<Place>();
 
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
     Eigen::array<int, 2> u_offsets({{0, 0}});
     auto u = g.slice(u_offsets, extents);  // update gate
@@ -185,38 +177,52 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
 
     // backward for unactivated update gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
-                   d_g.slice(u_offsets, extents), d_h * (h_p - c));
+                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
     // backward for unactivated output candidate
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
-                   d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
+                   d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
     math::gemm<Place, T>(context.device_context(), false, true, batch_size,
                          frame_size, frame_size, 1,
                          gate_grad_data + frame_size * 2, frame_size * 3,
                          weight_data + frame_size * frame_size * 2, frame_size,
                          0, reset_hidden_prev_grad_data, frame_size);
-    // backward for state_weight
-    math::gemm<Place, T>(
-        context.device_context(), true, false, frame_size, frame_size,
-        batch_size, 1, reset_hidden_prev_data, frame_size,
-        gate_grad_data + frame_size * 2, frame_size * 3, 0,
-        weight_grad_data + frame_size * frame_size * 2, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
-    // backward for update_gate_weight and reset_gate_weight
-    math::gemm<Place, T>(context.device_context(), true, false, frame_size,
-                         frame_size * 2, batch_size, 1, hidden_prev_data,
-                         frame_size, gate_grad_data, frame_size * 3, 0,
-                         weight_grad_data, frame_size * 2);
+    // backward for weight
+    if (weight_grad) {
+      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
+      // backward for state_weight
+      math::gemm<Place, T>(
+          context.device_context(), true, false, frame_size, frame_size,
+          batch_size, 1, reset_hidden_prev_data, frame_size,
+          gate_grad_data + frame_size * 2, frame_size * 3, 0,
+          weight_grad_data + frame_size * frame_size * 2, frame_size);
+
+      // backward for update_gate_weight and reset_gate_weight
+      math::gemm<Place, T>(context.device_context(), true, false, frame_size,
+                           frame_size * 2, batch_size, 1, hidden_prev_data,
+                           frame_size, gate_grad_data, frame_size * 3, 0,
+                           weight_grad_data, frame_size * 2);
+    }
     // backward for hidden_prev
-    d_h_p.device(place) = d_r_h_p * r + d_h * u;
-    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
-                         frame_size, frame_size * 2, 1, gate_grad_data,
-                         frame_size * 3, weight_data, frame_size * 2, 1,
-                         hidden_prev_grad_data, frame_size);
+    if (hidden_prev_grad) {
+      T* hidden_prev_grad_data =
+          hidden_prev_grad->mutable_data<T>(context.GetPlace());
+      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                           frame_size, frame_size * 2, 1, gate_grad_data,
+                           frame_size * 3, weight_data, frame_size * 2, 1,
+                           hidden_prev_grad_data, frame_size);
+    }
     // backward for input
-    d_x.device(place) = d_g;
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto d_x = EigenMatrix<T>::From(*input_grad);
+      d_x.device(place) = d_g;
+    }
     // backward for bias
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
index ddf73981751798c72cef08f2dd5c87580b45aec3..872f659fed40d7479d9d8bed6c8469fb28282253 100644
--- a/paddle/operators/linear_chain_crf_op.h
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -271,7 +271,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     ll -= std::log(sum);
     // Now ll is equal to -log(Z).
 
-    const int* lbl = label.data<int>();
+    const int64_t* lbl = label.data<int64_t>();
     PADDLE_ENFORCE_LT(
         static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
         "An invalid tag label that execesses the largest tag number.");
@@ -449,7 +449,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
                            Tensor* emission_grad) const {
     const T* w_exps = transition_exps.data<T>();
     const T* x_exps = emission_exps.data<T>();
-    const int* label_value = label.data<int>();
+    const int64_t* label_value = label.data<int64_t>();
     T* beta_value = beta->data<T>();
 
     auto x_dims = emission_exps.dims();
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
index 5697eaa460cf21bf73add1b460947e4f3d4edfc3..e40551ca73e991edd8e1d1df5b103c36367b7050 100644
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -126,7 +126,10 @@ class LayerHelper(object):
         self.startup_program.global_block().create_parameter(
             dtype=dtype, shape=shape, **attr_copy)
         return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'], dtype=dtype, shape=shape)
+            name=attr_copy['name'],
+            dtype=dtype,
+            shape=shape,
+            trainable=attr_copy.get('trainable', True))
 
     def create_tmp_variable(self, dtype):
         return self.main_program.current_block().create_var(
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
index abd4b22e8b68f3b5c3e961df83db34e419e7f4d5..fac91aac97267b1ecc867bb9b0b1f8fd40f2f299 100644
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -112,6 +112,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              param_initializer=None,
               param_attr=None,
               data_type='float32',
               main_program=None,
@@ -136,9 +137,16 @@ def embedding(input,
     to the LayerHelper constructor.
 
     """
+
+    def _get_default_param_initializer():
+        return XavierInitializer()
+
     helper = LayerHelper('embedding', **locals())
     w = helper.create_parameter(
-        attr=helper.param_attr, shape=size, dtype=data_type)
+        attr=helper.param_attr,
+        shape=size,
+        dtype=data_type,
+        initializer=param_initializer or _get_default_param_initializer())
     tmp = helper.create_tmp_variable(data_type)
     helper.append_op(
         type='lookup_table',
@@ -460,6 +468,41 @@ def sums(input, main_program=None, startup_program=None):
     return out
 
 
+def linear_chain_crf(input,
+                     label,
+                     param_attr=None,
+                     param_initializer=None,
+                     main_program=None,
+                     startup_program=None):
+    def _get_default_param_initializer():
+        return XavierInitializer()
+
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype(),
+        initializer=param_initializer or _get_default_param_initializer())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
+
+
 def assign(input, output, main_program=None, startup_program=None):
     helper = LayerHelper('assign', **locals())
     helper.append_op(
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
index d2841df6af7a0d860c239db952c767c995d30ba4..87a478c2903b77d955ebde49a4a0e507c9e9ffd3 100644
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -170,7 +170,8 @@ class Optimizer(object):
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is not None:
+            if param_and_grad[0].trainable is True and param_and_grad[
+                    1] is not None:
                 optimize_op = self._append_optimize_op(loss.block,
                                                        param_and_grad)
                 optimize_ops.append(optimize_op)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
new file mode 100644
index 0000000000000000000000000000000000000000..f66e6e748b76dec53a9e24b5b352d31395ce6bde
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -0,0 +1,192 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor, g_scope
+from paddle.v2.fluid.optimizer import SGDOptimizer
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 20
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm():
+    # 8 features
+    word = layers.data(name='word_data', shape=[1], data_type='int64')
+    predicate = layers.data(name='verb_data', shape=[1], data_type='int64')
+    ctx_n2 = layers.data(name='ctx_n2_data', shape=[1], data_type='int64')
+    ctx_n1 = layers.data(name='ctx_n1_data', shape=[1], data_type='int64')
+    ctx_0 = layers.data(name='ctx_0_data', shape=[1], data_type='int64')
+    ctx_p1 = layers.data(name='ctx_p1_data', shape=[1], data_type='int64')
+    ctx_p2 = layers.data(name='ctx_p2_data', shape=[1], data_type='int64')
+    mark = layers.data(name='mark_data', shape=[1], data_type='int64')
+
+    predicate_embedding = layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        data_type='float32',
+        is_sparse=IS_SPARSE,
+        param_attr={'name': 'vemb'})
+
+    mark_embedding = layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        data_type='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr={'name': embedding_name,
+                        'trainable': False}) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = layers.sums(input=hidden_0_layers)
+
+    lstm_0 = layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = layers.sums(input=[
+            layers.fc(input=input_tmp[0], size=hidden_dim),
+            layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = layers.sums(input=[
+        layers.fc(input=input_tmp[0], size=label_dict_len),
+        layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    feature_out = db_lstm()
+    target = layers.data(name='target', shape=[1], data_type='int64')
+    crf_cost = layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr={"name": 'crfw',
+                    "learning_rate": mix_hidden_lr})
+    avg_cost = layers.mean(x=crf_cost)
+    # TODO(qiao)
+    #   1. add crf_decode_layer and evaluator
+    #   2. use other optimizer and check why out will be NAN
+    sgd_optimizer = SGDOptimizer(learning_rate=0.0001)
+    opts = sgd_optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    embedding_param = g_scope.find_var(embedding_name).get_tensor()
+    embedding_param.set(
+        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
+
+    batch_id = 0
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
+            ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
+            ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
+            ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
+            ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
+            verb_data = to_lodtensor(map(lambda x: x[6], data), place)
+            mark_data = to_lodtensor(map(lambda x: x[7], data), place)
+            target = to_lodtensor(map(lambda x: x[8], data), place)
+
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'word_data': word_data,
+                               'ctx_n2_data': ctx_n2_data,
+                               'ctx_n1_data': ctx_n1_data,
+                               'ctx_0_data': ctx_0_data,
+                               'ctx_p1_data': ctx_p1_data,
+                               'ctx_p2_data': ctx_p2_data,
+                               'verb_data': verb_data,
+                               'mark_data': mark_data,
+                               'target': target
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+
+            if batch_id % 10 == 0:
+                print("avg_cost=" + str(avg_cost_val))
+
+            # exit early for CI
+            exit(0)
+
+            batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
index f356f6e9ec0da2d3e1fb67638d81e8d54c544f53..501d5aa5797d6def708338692f0861657f951ef7 100644
--- a/python/paddle/v2/fluid/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
@@ -28,8 +28,8 @@ def relu(x):
 
 
 class TestGRUUnitOp(OpTest):
-    batch_size = 3
-    frame_size = 5
+    batch_size = 5
+    frame_size = 10
     activate = {
         GRUActivationType.identity: identity,
         GRUActivationType.sigmoid: sigmoid,
@@ -77,7 +77,7 @@ class TestGRUUnitOp(OpTest):
         c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
                                                     g[:, frame_size * 2:])
         g = np.hstack((u_r, c))
-        h = u * h_p + (1 - u) * c
+        h = u * c + (1 - u) * h_p
         self.outputs = {
             'Gate': g.astype('float64'),
             'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -92,10 +92,7 @@ class TestGRUUnitOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight'],
-            ['Hidden', 'ResetHiddenPrev', 'Gate'],
-            max_relative_error=0.007)
+        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
 
 
 class TestGRUUnitOpWithBias(TestGRUUnitOp):
@@ -104,18 +101,20 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp):
         frame_size = self.frame_size
         super(TestGRUUnitOpWithBias, self).set_inputs()
         self.inputs['Bias'] = np.random.uniform(
-            -0.1, 0.1, (1, frame_size * 3)).astype('float32')
+            -0.1, 0.1, (1, frame_size * 3)).astype('float64')
         self.attrs = {
             'activation': GRUActivationType.identity,
             'gate_activation': GRUActivationType.sigmoid
         }
 
     def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
+
+    def test_check_grad_ingore_input(self):
         self.check_grad(
-            ['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
-            max_relative_error=0.007)
+            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
+            no_grad_set=set('Input'))
 
 
 if __name__ == '__main__':
-    exit(0)  # FIXME(yuyang18): This unittest is not pass. Fix it later
     unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 3d18e7ce3a4dc6c6b917a1000de39fca71f6ac18..d3dc45742d92dc61b81d9cdc04056c5d5bdc2b63 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,8 +1,8 @@
+import unittest
+
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
-import paddle.v2.fluid.core as core
-import unittest
 
 
 class TestBook(unittest.TestCase):
@@ -20,7 +20,8 @@ class TestBook(unittest.TestCase):
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
         program.append_backward(avg_cost)
-        print str(program)
+
+        # print str(program)
 
     def test_recognize_digits_mlp(self):
         program = Program()
@@ -49,7 +50,7 @@ class TestBook(unittest.TestCase):
             input=predict, label=label, main_program=program)
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
-        print str(program)
+        # print str(program)
 
     def test_simple_conv2d(self):
         program = Program()
@@ -64,7 +65,7 @@ class TestBook(unittest.TestCase):
             filter_size=[4, 4],
             main_program=program)
 
-        print str(program)
+        # print str(program)
 
     def test_recognize_digits_conv(self):
         program = Program()
@@ -103,7 +104,7 @@ class TestBook(unittest.TestCase):
 
         program.append_backward(avg_cost)
 
-        print str(program)
+        # print str(program)
 
     def test_word_embedding(self):
         program = Program()
@@ -164,7 +165,24 @@ class TestBook(unittest.TestCase):
         avg_cost = layers.mean(x=cost, main_program=program)
         self.assertIsNotNone(avg_cost)
 
-        print str(program)
+        # print str(program)
+
+    def test_linear_chain_crf(self):
+        program = Program()
+
+        # Change g_program, so the rest layers use `g_program`
+        images = layers.data(
+            name='pixel',
+            shape=[784],
+            data_type='float32',
+            main_program=program)
+        label = layers.data(
+            name='label', shape=[1], data_type='int32', main_program=program)
+        hidden = layers.fc(input=images, size=128, main_program=program)
+        crf = layers.linear_chain_crf(
+            input=hidden, label=label, main_program=program)
+
+        # print str(program)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
index 6f06a66c825b37ee91214efc0a29a58f0b9057f9..c26634ff20c46e484d600c758be386ec8327d1c1 100644
--- a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
@@ -104,7 +104,7 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),