diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..a498e882a3d85a33d44dbad7474fa2a340e33976
--- /dev/null
+++ b/doc/design/ops/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+  // Two-level RNN: one "2-th level RNN" box per time step feeds the "1-th level RNN".
+  rnn [label="1-th level RNN" shape=box]
+  // One cluster per time step: two sentence nodes feed that step's RNN box.
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2-th level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2-th level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2-th level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+  // Intermediate nodes: one "paragraph info" node per time step.
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+  // All paragraph info nodes feed the single 1-th level RNN.
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+  // The 1-th level RNN points at the final "chapter info" node.
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/design/ops/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/design/ops/images/2_level_rnn.png differ
diff --git a/doc/design/ops/images/rnn.dot b/doc/design/ops/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/design/ops/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+  label = "simple RNN implementation" 
+  // Data flow between the global scope and three step-scopes (0, 1, 2).
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+  // NOTE(review): nodesep is documented as a graph attribute; on "node" it may be ignored - confirm.
+  node[nodesep=1]
+  // Variables placed in the global scope cluster.
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+  // One cluster per step-scope, each with its memory, pre-memory, step input and step output.
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+  // Solid box: stepnet. Dashed boxes: one stepnetN per step-scope.
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+  // Blue edges: pre-memory 0 comes from boot_memory, later ones from the previous memory.
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+  // Remaining edges are black; the W edges are excluded from ranking (constraint=false).
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+  // The global input fans out to every step input.
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+  // Every step output points at the global output.
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+  // Each per-step stepnetN box points at the single stepnet node.
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/design/ops/images/rnn.jpg b/doc/design/ops/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/design/ops/images/rnn.jpg differ
diff --git a/doc/design/ops/images/rnn.png b/doc/design/ops/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/design/ops/images/rnn.png differ
diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/design/ops/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/design/ops/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+  chapter [label="chapter"]
+  // Two clusters, each holding one "top rnn step" box and two paragraph nodes.
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+  // One cluster per paragraph node (p0..p3): a "low rnn step" box pointing at two sentences.
+  subgraph cluster_p0 {
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+  // NOTE(review): cluster_p1, cluster_p2 and cluster_p3 all carry the label "sentence 1" - confirm intended.
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+  // The chapter node fans out to both top-level steps.
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+  // Each top-level step fans out to the two paragraph nodes of its cluster.
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+  // Each paragraph node points at the low-level RNN box of its own cluster.
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/design/ops/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/design/ops/images/rnn_2level_data.png differ
diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..a78eea7d45e9e9553d153170aa31da55ec6e8289
--- /dev/null
+++ b/doc/design/ops/rnn.md
@@ -0,0 +1,153 @@
+# RNNOp design
+
+This document describes an RNN operator that requires all instances in a mini-batch to have the same length.  A more flexible RNN operator will be provided in the future.
+
+## RNN Algorithm Implementation
+
+<p align="center">
+<img src="./images/rnn.jpg"/>
+</p>
+
+The above diagram shows an RNN unrolled into a full network.
+
+There are several important concepts:
+
+- *step-net*: the sub-graph to run at each step,
+- *memory*, $h_t$, the state of the current step,
+- *ex-memory*, $h_{t-1}$, the state of the previous step,
+- *initial memory value*, the ex-memory of the first step.
+
+### Step-scope
+
+There could be local variables defined in step-nets.  PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step.
+
+<p align="center">
+<img src="./images/rnn.png"/><br/>
+Figure 2. The RNN's data flow
+</p>
+
+Please be aware that all steps run the same step-net.  Each step
+
+1. creates the step-scope,
+2. realizes local variables, including step-outputs, in the step-scope, and
+3. runs the step-net, which could use these variables.
+
+The RNN operator will compose its output from step outputs in step scopes.
+
+### Memory and Ex-memory
+
+Let's give more details about memory and ex-memory via a simple example:
+
+$$
+h_t = U h_{t-1} + W x_t,
+$$
+
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$, respectively.
+
+In the implementation, we can either make an ex-memory variable refer to the memory variable of the previous step,
+or copy the value of the previous step's memory to the current ex-memory variable.
+
+### Usage in Python
+
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+We can define an RNN's step-net using Block:
+
+```python
+import paddle as pd
+
+X = some_op() # X is some operator's output, and is a LoDTensor
+a = some_op()
+
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+    x = rnn.add_input(X)
+    # declare a memory (rnn's step)
+    h = rnn.add_memory(init=a)
+    # h.pre_state() means previous memory of rnn
+    new_state = pd.add_two(pd.matmul(W, x), pd.matmul(U, h.pre_state()))
+    # update current memory
+    h.update(new_state)
+    # indicate that h variables in all step scopes should be merged
+    rnn.add_outputs(h)
+
+out = rnn()
+```
+
+Python API functions in the above example:
+
+- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory` creates a variable used as the memory.
+- `rnn.add_outputs` marks the variables that will be concatenated across steps into the RNN output.
+
+### Nested RNN and LoDTensor
+
+An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
+
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences.
+
+The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text.
+
+<p align="center">
+<img src="./images/2_level_rnn.png"/>
+</p>
+
+```python
+import paddle as pd
+
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+
+# a is output of some op
+a = some_op()
+
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+
+def lower_level_rnn(paragraph):
+    '''
+    paragraph: the input
+    '''
+    rnn = pd.create_rnn_op(output_num=1)
+    with rnn.stepnet():
+        sentence = rnn.add_input(paragraph, level=0)
+        h = rnn.add_memory(shape=[20, 30])
+        h.update(
+            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+        # get the last state as sentence's info
+        rnn.add_outputs(h)
+    return rnn
+
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = top_level_rnn.add_input(chapter_data, level=1)
+    low_rnn = lower_level_rnn(paragraph_data)
+    paragraph_out = low_rnn()
+
+    h = top_level_rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
+    top_level_rnn.add_outputs(h)
+
+# just output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+
+In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`.  The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+
+By default, the `RNNOp` will concatenate the outputs from all the time steps;
+if `output_all_steps` is set to False, it will only output the final time step.
+
+
+<p align="center">
+<img src="images/rnn_2level_data.png"/>
+</p>
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ca04d402879b6a955d849a32175194df82b65c8
--- /dev/null
+++ b/paddle/operators/accuracy_op.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/accuracy_op.h"
+
+namespace paddle {
+namespace operators {
+// AccuracyOp validates "Inference"/"Label" and sizes the "Accuracy" output.
+class AccuracyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Inference"),
+                            "Input of Inference must be initialized.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Label"),
+                            "Input of Label must be initialized.");
+    auto *inference = ctx.Input<framework::Tensor>("Inference");
+    auto *label = ctx.Input<framework::Tensor>("Label");
+    // Label is rank 1 and has as many entries as Inference has rows.
+    PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label must be a vector");
+    PADDLE_ENFORCE_EQ(inference->dims()[0], label->dims()[0],
+                      "inference size must be the same as label size");
+    // The output is resized to hold a single value.
+    ctx.Output<Tensor>("Accuracy")->Resize({1});
+  }
+};
+// Declares the inputs, the output and the doc string of the accuracy op.
+class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AccuracyOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // TODO(typhoonzero): support both inference value and indices.
+    AddInput("Inference", "topk(indices) the network output");
+    AddInput("Label", "Label of the training data");
+    // TODO(typhoonzero): AddInput("Weight", ...
+    AddOutput("Accuracy", "The accuracy of current batch");
+    // Raw string literal: backslashes are kept verbatim, hence a single "\".
+    AddComment(
+        R"DOC(Accuracy. It will print accuracy rate for classification.
+The accuracy is:
+..  math::
+accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples})DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Register the op through REGISTER_OP_WITHOUT_GRADIENT and its float CPU kernel.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
+REGISTER_OP_CPU_KERNEL(accuracy,
+                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4e6d1ef9654012ce6355cbd7561c4fdc1785c11a
--- /dev/null
+++ b/paddle/operators/accuracy_op.cu
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/accuracy_op.h"
+
+namespace paddle {
+namespace operators {
+// Counts the rows whose label is among their D entries; writes count / N.
+__global__ void AccuracySingleKernel(const int N, const int D, const int top_k,
+                                     const int* Xdata, const int* labelData,
+                                     float* accuracy) {  // top_k is not read
+  int correct = 0;  // number of rows in which the label was found
+  for (int row = 0; row < N; row++) {
+    const int label = labelData[row];
+    for (int col = 0; col < D; col++) {
+      const int pred = Xdata[row * D + col];  // row-major [N, D] indexing
+      if (pred == label) {
+        ++correct;
+        break;  // count each row at most once
+      }
+    }
+  }
+  *accuracy = static_cast<float>(correct) / static_cast<float>(N);
+}
+// GPU kernel of the accuracy op; launches AccuracySingleKernel with <<<1, 1>>>.
+template <typename T>
+class AccuracyOpCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto* inference = ctx.Input<Tensor>("Inference");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    // FIXME(typhoonzero): only support indices currently
+    // if add support for output values, how to detect the data type?
+    const int* inference_data = inference->data<int>();
+    const int* label_data = label->data<int>();
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+    size_t num_samples = inference->dims()[0];
+    size_t infer_width = inference->dims()[1];
+    // Zero the output buffer itself, not the host variable holding its address.
+    cudaMemset(accuracy_data, 0, sizeof(float));
+    // Nothing to count: the zeroed accuracy is the result.
+    if (num_samples == 0) {
+      return;
+    }
+    // One block with one thread walks all rows; 1 is passed as top_k.
+    AccuracySingleKernel<<<1, 1>>>(num_samples, infer_width, 1, inference_data,
+                                   label_data, accuracy_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Register AccuracyOpCUDAKernel<float> as the GPU kernel of the accuracy op.
+REGISTER_OP_GPU_KERNEL(accuracy,
+                       paddle::operators::AccuracyOpCUDAKernel<float>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe704efe1c979f4fc6a5a37184e51b416f5e517f
--- /dev/null
+++ b/paddle/operators/accuracy_op.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Shorthand for framework::Tensor inside paddle::operators.
+using Tensor = framework::Tensor;
+// Aliases of the framework Eigen views; row-major and Eigen::DenseIndex by default.
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+// Computes the share of samples whose label occurs in their Inference row.
+template <typename Place, typename T>
+class AccuracyKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Inference");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+
+    const T* inference_data = inference->data<T>();
+    const T* label_data = label->data<T>();
+    // Inference is read as a row-major [num_samples, class_dim] buffer.
+    size_t num_samples = inference->dims()[0];
+    size_t class_dim = inference->dims()[1];
+    *accuracy_data = 0.0f;
+    // With no samples the accuracy stays at the 0 written above.
+    if (num_samples == 0) {
+      return;
+    }
+
+    int num_correct = 0;
+    // assume inference is already the topk of the output
+    for (size_t i = 0; i < num_samples; ++i) {
+      PADDLE_ENFORCE_GE(label_data[i], 0, "label must >= 0");
+      for (size_t j = 0; j < class_dim; ++j) {
+        if (inference_data[i * class_dim + j] == label_data[i]) {
+          ++num_correct;
+          break;
+        }
+      }
+    }
+
+    // FIXME(typhoonzero): we don't accumulate the accuracy for now.
+    *accuracy_data =
+        static_cast<float>(num_correct) / static_cast<float>(num_samples);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 7b856c9776cd0b168af7b53e8dfcbe81b5891add..253b17d8a1b88eccc58fc458ae8274d2bbd1c323 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -25,16 +25,30 @@ class CosSimOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    // notnull check: both inputs must exist before their dims are read
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
-    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
-                      ctx.Input<Tensor>("Y")->dims(),
-                      "Dimensions of Input(X) and Input(Y) must be the same.");
-
-    auto dims = ctx.Input<Tensor>("X")->dims();
-    ctx.Output<framework::LoDTensor>("Out")->Resize({dims[0], 1});
-    ctx.Output<framework::LoDTensor>("XNorm")->Resize({dims[0], 1});
-    ctx.Output<framework::LoDTensor>("YNorm")->Resize({dims[0], 1});
+
+    // shape check: Y may differ from X only by having a 1st dimension of 1
+    auto x_dims = ctx.Input<Tensor>("X")->dims();
+    auto y_dims = ctx.Input<Tensor>("Y")->dims();
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                      "Ranks of Input(X) and Input(Y) must be equal.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) must not be less than 2.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
+                   " just 1 (which will be broadcasted to match Input(X)).");
+
+    // resize tensor: Out/XNorm take X's 1st dimension, YNorm takes Y's
+    ctx.Output<framework::LoDTensor>("Out")->Resize({x_dims[0], 1});
+    ctx.Output<framework::LoDTensor>("XNorm")->Resize({x_dims[0], 1});
+    ctx.Output<framework::LoDTensor>("YNorm")->Resize({y_dims[0], 1});
   }
 };
 
@@ -42,16 +56,27 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CosSimOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of cos_sim op.");
-    AddInput("Y", "The second input of cos_sim op.");
+    AddInput("X", "The 1st input of cos_sim op.");
+    AddInput("Y", "The 2nd input of cos_sim op.");
     AddOutput("Out", "The output of cos_sim op.");
-    AddOutput("XNorm", "Row norm of the first input.").AsIntermediate();
-    AddOutput("YNorm", "Row norm of the second input.").AsIntermediate();
+    AddOutput("XNorm",
+              "Norm of the first input, reduced along all dimensions "
+              "except the 1st.")
+        .AsIntermediate();  // resized to [X.Dim(0), 1] by CosSimOp::InferShape
+    AddOutput("YNorm",
+              "Norm of the second input, reduced along all dimensions "
+              "except the 1st.")
+        .AsIntermediate();  // resized to [Y.Dim(0), 1] by CosSimOp::InferShape
 
     AddComment(R"DOC(
 Cosine Similarity Operator.
 
-The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y))
+The equation is: Out = X^T * Y / (sqrt(X^T * X) * sqrt(Y^T * Y)).
+
+Input(X) and Input(Y) must have the same shape, except that the 1st dimension
+of Input(Y) could be just 1 (different from Input(X)), which will be
+broadcasted to match the shape of Input(X) before computing their cosine
+similarity.
 )DOC");
   }
 };
@@ -62,32 +87,50 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(const framework::InferShapeContext &ctx) const override {
+    // notnull check: every tensor whose dims are read below must exist
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Y"), "Input(Y) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("XNorm"),
                             "Input(XNorm) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("YNorm"),
                             "Input(YNorm) must not be null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar("Out"),
+                            "Input(Out) must not be null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Out")),
                             "Input(Out@GRAD) must not be null.");
 
+    // shape check: X/Y as in CosSimOp, then XNorm, YNorm, Out and Out@GRAD
     auto x_dims = ctx.Input<Tensor>("X")->dims();
     auto y_dims = ctx.Input<Tensor>("Y")->dims();
     auto xnorm_dims = ctx.Input<Tensor>("XNorm")->dims();
     auto ynorm_dims = ctx.Input<Tensor>("YNorm")->dims();
-    auto out_dims = ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                      "Dimensions of Input(X) and Input(Y) must be the same.");
-    PADDLE_ENFORCE_EQ(xnorm_dims[0], x_dims[0],
-                      "1st dimension of XNorm must equal that of Input(X).");
-    PADDLE_ENFORCE_EQ(xnorm_dims[1], 1, "2st dimension of XNorm must be one.");
-    PADDLE_ENFORCE_EQ(ynorm_dims[0], y_dims[0],
-                      "1st dimension of YNorm must equal that of Input(Y).");
-    PADDLE_ENFORCE_EQ(ynorm_dims[1], 1, "2st dimension of YNorm must be one.");
-    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
-                      "1st dimension of Out@GRAD must equal that of Input(X)");
-    PADDLE_ENFORCE_EQ(out_dims[1], 1, "1st dimension of Out@GRAD must be one.");
-
+    auto out_dims = ctx.Input<Tensor>("Out")->dims();
+    auto out_grad_dims =
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->dims();
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                      "Ranks of Input(X) and Input(Y) must be equal.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) must not be less than 2.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
+                   " just 1 (which will be broadcasted to match Input(X)).");
+    auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1});
+    auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1});
+    PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims,
+                      "Shape of Input(XNorm) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims,
+                      "Shape of Input(YNorm) must be [Y.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims,
+                      "Shape of Input(Out) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims,
+                      "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
+
+    // resize tensor
     auto *x_grad =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     auto *y_grad =
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index 0dc509952578497671a128374f77ce616a520909..318b63f3707cf77755de773a39b00aa30d2296d3 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -31,30 +31,38 @@ template <typename Place, typename T>
 class CosSimKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_x = context.Input<Tensor>("X");
-    auto* input_y = context.Input<Tensor>("Y");
-    auto* output_z = context.Output<Tensor>("Out");
-    auto* output_x_norm = context.Output<Tensor>("XNorm");
-    auto* output_y_norm = context.Output<Tensor>("YNorm");
+    // get Tensor: inputs X/Y, and the outputs whose buffers are allocated here
+    auto* in_x = context.Input<Tensor>("X");
+    auto* in_y = context.Input<Tensor>("Y");
+    auto* out_z = context.Output<Tensor>("Out");
+    auto* out_x_norm = context.Output<Tensor>("XNorm");
+    auto* out_y_norm = context.Output<Tensor>("YNorm");
+    out_z->mutable_data<T>(context.GetPlace());
+    out_x_norm->mutable_data<T>(context.GetPlace());
+    out_y_norm->mutable_data<T>(context.GetPlace());
 
-    output_z->mutable_data<T>(context.GetPlace());
-    output_x_norm->mutable_data<T>(context.GetPlace());
-    output_y_norm->mutable_data<T>(context.GetPlace());
-
-    auto dims = input_x->dims();
-    int64_t size = input_x->numel();
-    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
-    auto x = EigenMatrix<T>::From(*input_x, new_dims);
-    auto y = EigenMatrix<T>::From(*input_y, new_dims);
-    auto z = EigenVector<T>::Flatten(*output_z);
-    auto x_norm = EigenVector<T>::Flatten(*output_x_norm);
-    auto y_norm = EigenVector<T>::Flatten(*output_y_norm);
+    // convert Tensor to Eigen Tensor: X/Y as matrices, outputs as flat vectors
+    int rows_x = in_x->dims()[0];
+    int rows_y = in_y->dims()[0];
+    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
+    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
+    auto z = EigenVector<T>::Flatten(*out_z);
+    auto x_norm = EigenVector<T>::Flatten(*out_x_norm);
+    auto y_norm = EigenVector<T>::Flatten(*out_y_norm);
 
+    // compute: row-wise norms, then the dot product divided by both norms
     auto place = context.GetEigenDevice<Place>();
-    auto xy = (x * y).sum(Eigen::array<int, 1>({{1}}));
-    x_norm.device(place) = x.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
-    y_norm.device(place) = y.square().sum(Eigen::array<int, 1>({{1}})).sqrt();
-    z.device(place) = xy / x_norm / y_norm;
+    auto row_along = Eigen::array<int, 1>({{1}});
+    x_norm.device(place) = x.square().sum(row_along).sqrt();
+    y_norm.device(place) = y.square().sum(row_along).sqrt();
+    if (rows_x == rows_y) {
+      auto xy = (x * y).sum(row_along);
+      z.device(place) = xy / x_norm / y_norm;
+    } else {  // row counts differ: repeat Y (and its norm) rows_x times
+      Eigen::DSizes<int, 2> bcast(rows_x, 1);
+      auto xy = (x * y.broadcast(bcast)).sum(row_along);
+      z.device(place) = xy / x_norm / y_norm.broadcast(bcast);
+    }
   }
 };
 
@@ -62,43 +70,72 @@ template <typename Place, typename T>
 class CosSimGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* input_x = context.Input<Tensor>("X");
-    auto* input_y = context.Input<Tensor>("Y");
-    auto* input_z = context.Input<Tensor>("Out");
-    auto* input_x_norm = context.Input<Tensor>("XNorm");
-    auto* input_y_norm = context.Input<Tensor>("YNorm");
-    auto* output_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* output_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
-    auto* input_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
+    // fetch forward tensors (X, Y, Out, XNorm, YNorm) and gradient tensors
+    auto* in_x = context.Input<Tensor>("X");
+    auto* in_y = context.Input<Tensor>("Y");
+    auto* in_z = context.Input<Tensor>("Out");
+    auto* in_x_norm = context.Input<Tensor>("XNorm");
+    auto* in_y_norm = context.Input<Tensor>("YNorm");
+    auto* out_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
 
-    auto dims = input_x->dims();
-    int64_t size = input_x->numel();
-    auto new_dims = framework::make_ddim({dims[0], size / dims[0]});
-    auto x = EigenMatrix<T>::From(*input_x, new_dims);
-    auto y = EigenMatrix<T>::From(*input_y, new_dims);
-    auto z = EigenMatrix<T>::From(*input_z);
-    auto x_norm = EigenMatrix<T>::From(*input_x_norm);
-    auto y_norm = EigenMatrix<T>::From(*input_y_norm);
-    auto dz = EigenMatrix<T>::From(*input_grad_z);
+    // convert each Tensor to an Eigen matrix
+    auto x = EigenMatrix<T>::Reshape(*in_x, 1);
+    auto y = EigenMatrix<T>::Reshape(*in_y, 1);
+    auto z = EigenMatrix<T>::Reshape(*in_z, 1);
+    auto x_norm = EigenMatrix<T>::Reshape(*in_x_norm, 1);
+    auto y_norm = EigenMatrix<T>::Reshape(*in_y_norm, 1);
+    auto dz = EigenMatrix<T>::Reshape(*in_grad_z, 1);
 
-    Eigen::DSizes<int, 2> bcast(1, new_dims[1]);
-    auto z_bcast = z.broadcast(bcast);
-    auto dz_bcast = dz.broadcast(bcast);
+    // compute gradient
+    int rows_x = in_x->dims()[0];
+    int rows_y = in_y->dims()[0];
+    int cols = framework::product(in_x->dims()) / rows_x;
+    Eigen::DSizes<int, 2> bcast_cols(1, cols);
+    auto z_bcast = z.broadcast(bcast_cols);
+    auto dz_bcast = dz.broadcast(bcast_cols);
+    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols);
     auto place = context.GetEigenDevice<Place>();
-    auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast);
-    auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast);
-    auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast);
-    if (output_grad_x) {
-      output_grad_x->mutable_data<T>(context.GetPlace());
-      auto dx = EigenMatrix<T>::From(*output_grad_x, new_dims);
-      dx.device(place) =
-          dz_bcast * (y / norm_prod_bcast - z_bcast * x / x_snorm_bcast);
-    }
-    if (output_grad_y) {
-      output_grad_y->mutable_data<T>(context.GetPlace());
-      auto dy = EigenMatrix<T>::From(*output_grad_y, new_dims);
-      dy.device(place) =
-          dz_bcast * (x / norm_prod_bcast - z_bcast * y / y_snorm_bcast);
+    if (rows_x == rows_y) {
+      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols);
+      auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols);
+      // compute dx
+      if (out_grad_x) {
+        out_grad_x->mutable_data<T>(context.GetPlace());
+        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
+        auto grad = y / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
+        dx.device(place) = dz_bcast * grad;
+      }
+      // compute dy
+      if (out_grad_y) {
+        out_grad_y->mutable_data<T>(context.GetPlace());
+        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto grad = x / norm_prod_bcast - z_bcast * y / y_snorm_bcast;
+        dy.device(place) = dz_bcast * grad;
+      }
+    } else {  // row counts differ: y is repeated rows_x times along dim 0
+      Eigen::DSizes<int, 2> bcast_rows(rows_x, 1);
+      Eigen::DSizes<int, 2> bcast_rows_cols(rows_x, cols);
+      auto y_bcast = y.broadcast(bcast_rows);
+      auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_rows_cols);
+      auto norm_prod_bcast = (x_norm * y_norm.eval().broadcast(bcast_rows))
+                                 .eval()
+                                 .broadcast(bcast_cols);
+      // compute dx
+      if (out_grad_x) {
+        out_grad_x->mutable_data<T>(context.GetPlace());
+        auto dx = EigenMatrix<T>::Reshape(*out_grad_x, 1);
+        auto grad = y_bcast / norm_prod_bcast - z_bcast * x / x_snorm_bcast;
+        dx.device(place) = dz_bcast * grad;
+      }
+      // compute dy: per-row gradients are summed over dim 0
+      if (out_grad_y) {
+        out_grad_y->mutable_data<T>(context.GetPlace());
+        auto dy = EigenMatrix<T>::Reshape(*out_grad_y, 1);
+        auto grad = x / norm_prod_bcast - z_bcast * y_bcast / y_snorm_bcast;
+        dy.device(place) = (dz_bcast * grad).sum(Eigen::array<int, 1>({0}));
+      }
     }
   }
 };
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 10ba3ca5cad1c6f999b05e7bd8420841b2738a6c..342f10d57f6ca388951dbdfd76e110f1b1bd0c38 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -29,7 +29,7 @@ class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(X->dims().size(), 2, "X's dimension must be 2.");
     PADDLE_ENFORCE_EQ(label->dims().size(), 1, "label's dimension must be 1.");
     PADDLE_ENFORCE_EQ(X->dims()[0], label->dims()[0]);
-    ctx.Output<framework::LoDTensor>("Y")->Resize({X->dims()[0]});
+    ctx.Output<framework::LoDTensor>("Y")->Resize({X->dims()[0], 1});
   }
 };
 
diff --git a/paddle/operators/name_convention.md b/paddle/operators/name_convention.md
index a090e0b5450509affdd739f63df618595f204f97..379385dc5d914101c7b5c9494f9383b6cf6a9b79 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -38,9 +38,11 @@ public:
   AccumulateOpMaker(framework::OpProto *proto,
                             framework::OpAttrChecker *op_checker)
     : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. If the output size is not the same as input size, the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor. "
+                  "If the output size is not the same as input size, "
+                  "the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
     AddOutput("Out", "(Tensor) Accumulated output tensor");
-    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier");
+    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
     AddComment(R"DOC(
 Accumulate operator accumulates the input tensor to the output tensor. If the
 output tensor already has the right size, we add to it; otherwise, we first
@@ -51,7 +53,7 @@ Accumulation is done as shown:
 
 Out = 1*X + gamma*Out
 
-where X is the input tensor, Y is the output tensor and gamma is the multiplier
+where X is the input tensor, Out is the output tensor and gamma is the multiplier
 argument.
 )DOC");
   }
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 926e60142615e200b948d6487e40e56ff95cd777..cbee173b3742b3ce7bd477a117653be28f06e989 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -52,6 +52,7 @@ USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_OP(pad);
 USE_CPU_ONLY_OP(scatter);
+USE_OP(accuracy);
 USE_CPU_ONLY_OP(concat);
 USE_OP(top_k);
 USE_OP(squared_l2_distance);
diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/framework/tests/test_accuracy_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d60eb90d5edbd6944a11f7555f0291720dd2be
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
@@ -0,0 +1,25 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAccuracyOp(OpTest):
+    """Check the "accuracy" op against a reference computed with NumPy."""
+
+    def setUp(self):
+        self.op_type = "accuracy"
+        infer = np.random.randint(0, 2, (32, 1)).astype("int")
+        label = np.random.randint(0, 2, (32, )).astype("int")
+        self.inputs = {'Inference': infer, "Label": label}
+        # A row is correct when any of its predictions equals its label.
+        num_correct = sum(
+            1 for rowid in range(32) if label[rowid] in infer[rowid])
+        self.outputs = {'Accuracy': [num_correct / 32.0]}
+
+    def test_check_output(self):
+        """Run the op and compare its outputs with self.outputs."""
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/framework/tests/test_cos_sim_op.py
index 797cbd8cc5cf7f73d58ca713d02667731d5c8a0e..d314ce391ea2f10a8bd77c24e84fa3e1eebb6c73 100644
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
@@ -7,8 +7,8 @@ class TestCosSimOp(OpTest):
     def setUp(self):
         self.op_type = "cos_sim"
         self.inputs = {
-            'X': np.random.random((10, 5)).astype("float32"),
-            'Y': np.random.random((10, 5)).astype("float32")
+            'X': np.random.random((6, 5)).astype("float32"),
+            'Y': np.random.random((6, 5)).astype("float32")
         }
         expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
         expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
@@ -28,12 +28,66 @@ class TestCosSimOp(OpTest):
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
 
-    def test_check_grad_ignore_y(self):
+    def test_check_grad_ingore_y(self):
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
 
 
-if __name__ == "__main__":
+class TestCosSimOp2(TestCosSimOp):
+    """Case: X is (6, 5), Y is (1, 5) and is broadcast over X's rows."""
+
+    def setUp(self):
+        self.op_type = "cos_sim"
+        x = np.random.random((6, 5)).astype("float32")
+        y = np.random.random((1, 5)).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        x_norm = np.linalg.norm(x, axis=1)
+        y_norm = np.linalg.norm(y, axis=1)
+        cos = (x * y).sum(axis=1) / x_norm / y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(x_norm, 1),
+            'YNorm': np.expand_dims(y_norm, 1),
+            'Out': np.expand_dims(cos, 1)
+        }
+
+
+class TestCosSimOp3(TestCosSimOp):
+    """Case: rank-3 inputs, X and Y are both (6, 5, 2)."""
+
+    def setUp(self):
+        self.op_type = "cos_sim"
+        x = np.random.random((6, 5, 2)).astype("float32")
+        y = np.random.random((6, 5, 2)).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        x_norm = np.linalg.norm(x, axis=(1, 2))
+        y_norm = np.linalg.norm(y, axis=(1, 2))
+        cos = (x * y).sum(axis=(1, 2)) / x_norm / y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(x_norm, 1),
+            'YNorm': np.expand_dims(y_norm, 1),
+            'Out': np.expand_dims(cos, 1)
+        }
+
+
+class TestCosSimOp4(TestCosSimOp):
+    """Case: rank-3 inputs, X is (6, 5, 2) and Y is (1, 5, 2)."""
+
+    def setUp(self):
+        self.op_type = "cos_sim"
+        x = np.random.random((6, 5, 2)).astype("float32")
+        y = np.random.random((1, 5, 2)).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        x_norm = np.linalg.norm(x, axis=(1, 2))
+        y_norm = np.linalg.norm(y, axis=(1, 2))
+        cos = (x * y).sum(axis=(1, 2)) / x_norm / y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(x_norm, 1),
+            'YNorm': np.expand_dims(y_norm, 1),
+            'Out': np.expand_dims(cos, 1)
+        }
+
+
+if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index c2fc102a8b8de82da5c3fc5fee273790325908f8..253e7b8a24465da63a7eacd7983eb831251e6230 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -8,20 +8,22 @@ class TestCrossEntropy(OpTest):
         self.op_type = "onehot_cross_entropy"
         batch_size = 30
         class_num = 10
+
         X = numpy.random.uniform(0.1, 1.0,
                                  [batch_size, class_num]).astype("float32")
-        label = (class_num / 2) * numpy.ones(batch_size).astype("int32")
-        self.inputs = {'X': X, 'label': label}
-        Y = []
-        for i in range(0, batch_size):
-            Y.append(-numpy.log(X[i][label[i]]))
-        self.outputs = {'Y': numpy.array(Y).astype("float32")}
+        labels = numpy.random.randint(0, class_num, batch_size, dtype="int32")
+
+        cross_entropy = numpy.asmatrix(
+            [[-numpy.log(X[i][labels[i]])] for i in range(X.shape[0])],
+            dtype="float32")
+        self.inputs = {"X": X, "label": labels}
+        self.outputs = {"Y": cross_entropy}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
+        self.check_grad(["X"], "Y")
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/framework/tests/test_pad_op.py
index 456b765e331fc4c80e6fd817c88d7ec533158ecb..9052e63b5683801da7c73be4de23013c949add98 100644
--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
@@ -22,7 +22,7 @@ class TestPadOp(OpTest):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', max_relative_error=0.006)
 
     def initTestCase(self):
         self.shape = (16, 16)