diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 80f7b69c142eb02c64076c8724a16d09e31c72d0..2e16201e74c153888594ebe6679fb0036734dad4 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -37,13 +37,13 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x->dims()[0], label->dims()[0],
                       "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
-    if (ctx.Attr<bool>("soft_label")) {
+    if (ctx.Attr<bool>("softLabel")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "If Attr(softLabel) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "If Attr(soft_label) == false, the 2nd dimension of "
+                        "If Attr(softLabel) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
 
@@ -63,6 +63,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                             "Input(Label) should be not null.");
     PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(framework::GradVarName("Y")),
                             "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE_NOT_NULL(ctx.OutputVar(framework::GradVarName("X")),
+                            "Output(X@GRAD) should be not null.");
 
     auto x = ctx.Input<Tensor>("X");
     auto label = ctx.Input<Tensor>("Label");
@@ -80,13 +82,13 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "be equal.");
     PADDLE_ENFORCE_EQ(dy->dims()[1], 1,
                       "The 2nd dimension of Input(Y@Grad) should be 1.");
-    if (ctx.Attr<bool>("soft_label")) {
+    if (ctx.Attr<bool>("softLabel")) {
       PADDLE_ENFORCE_EQ(x->dims()[1], label->dims()[1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "When Attr(softLabel) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label->dims()[1], 1,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "When Attr(softLabel) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
 
@@ -105,18 +107,19 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "where N is the batch size and D is the number of classes. "
              "This input is a probability computed by the previous operator, "
              "which is almost always the result of a softmax operator.");
-    AddInput("Label",
-             "(Tensor, default Tensor<int>), the ground truth which is "
-             "a 1-D or 2-D tensor. "
-             "When soft_label is set to 0, `Label` is a Tensor<int> with shape "
-             "[N x 1]. "
-             "When soft_label is set to 1, `Label` is a Tensor<float/double> "
-             "with shape [N x K].");
+    AddInput(
+        "Label",
+        "(Tensor, default Tensor<int>), the ground truth which is "
+        "a 2-D tensor. "
+        "When softLabel is set to false, `Label` is a Tensor<int> with shape "
+        "[N x 1]. "
+        "When softLabel is set to true, `Label` is a Tensor<float/double> "
+        "with shape [N x K].");
     AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 1-D tensor "
+              "(Tensor, default Tensor<float>), a 2-D tensor "
               "with shape [N x 1]. The cross entropy loss.");
     AddAttr<bool>(
-        "soft_label",
+        "softLabel",
         "(bool, default false), a flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
@@ -126,12 +129,12 @@ CrossEntropy Operator.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    soft_label = False, Label[i, 0] indicates the class index for sample i:
+    softLabel = false, Label[i, 0] indicates the class index for sample i:
 
                 Y[i] = -log(X[i, Label[i]])
 
 2) Soft-label cross-entropy:
-    soft_label = True, Label[i, j] indicates the soft label of class j
+    softLabel = true, Label[i, j] indicates the soft label of class j
     for sample i:
 
                 Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 283021eae53bc49dc121f6ee2a1125a4cff6266e..18e44d77c9f62b296dc57952e546f844670c7d57 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -70,7 +70,7 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
 
 // TODO(qingqing): make zero setting a common function.
 template <typename T>
-__global__ void zero(T* X, const int N) {
+__global__ void Zero(T* X, const int N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     X[i] = 0.0;
@@ -108,18 +108,17 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
 
-    auto x = ctx.Input<Tensor>("X");
-    auto y = ctx.Output<Tensor>("Y");
-    auto label = ctx.Input<Tensor>("Label");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
 
-    auto* x_data = x->data<T>();
-    y->mutable_data<T>(ctx.GetPlace());
-    auto* y_data = y->data<T>();
+    const T* x_data = x->data<T>();
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
 
     int batch_size = x->dims()[0];
     int class_num = x->dims()[1];
-    if (ctx.Attr<bool>("soft_label")) {
+    if (ctx.Attr<bool>("softLabel")) {
       auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
       int block = class_num > 512 ?
                       512 : pow(2, int(std::log2(class_num)));
@@ -148,38 +147,41 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
 
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
+    const T* dy_data =
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
 
-    int n = x->dims()[0];
-    int d = x->dims()[1];
+    int batch_size = x->dims()[0];
+    int class_num = x->dims()[1];
 
     int block = 512;
-    int grid = (n * d + block - 1) / block;
-    zero<T><<<grid, block, 0,
-              reinterpret_cast<const platform::CUDADeviceContext&>(
-                  ctx.device_context())
-                  .stream()>>>(dx_data, n * d);
-    if (ctx.Attr<bool>("soft_label")) {
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    if (ctx.Attr<bool>("softLabel")) {
       auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
                               .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           n, d);
+                                           batch_size, class_num);
     } else {
+      Zero<T><<<grid, block, 0,
+                reinterpret_cast<const platform::CUDADeviceContext&>(
+                    ctx.device_context())
+                    .stream()>>>(dx_data, batch_size * class_num);
+      auto* label_data = label->data<int>();
+      grid = (batch_size + block - 1) / block;
       CrossEntropyGradientKernel<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
                               .stream()>>>(dx_data, dy_data, x_data, label_data,
-                                           n, d);
+                                           batch_size, class_num);
     }
   }
 };
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 6b3f8c95bee58a3bf7d5f40ba84656beee6cdc97..255b2e9f5ea7566cca7fd3914e38da804b7c7006 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -42,14 +42,14 @@ class CrossEntropyOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
+                   "This kernel only runs on CPU.");
     const Tensor* x = ctx.Input<Tensor>("X");
     const Tensor* labels = ctx.Input<Tensor>("Label");
     Tensor* y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
+    T* y_data = y->mutable_data<T>(ctx.GetPlace());
 
     const int batch_size = x->dims()[0];
-    if (ctx.Attr<bool>("soft_label")) {
+    if (ctx.Attr<bool>("softLabel")) {
       auto prob = EigenMatrix<T>::From(*x);
       auto lbl_mat = EigenMatrix<T>::From(*labels);
       auto loss = EigenMatrix<T>::From(*y);
@@ -60,9 +60,7 @@ class CrossEntropyOpKernel : public framework::OpKernel {
                    .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
     } else {
       const int class_num = x->dims()[1];
 
-      const T* x_data = x->data<T>();
-      T* y_data = y->data<T>();
       const int* label_data = labels->data<int>();
 
       for (int i = 0; i < batch_size; ++i) {
@@ -78,33 +76,32 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
-
-    auto x = ctx.Input<Tensor>("X");
-    auto dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    auto label = ctx.Input<Tensor>("Label");
-
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    auto* dy_data = dy->data<T>();
-    auto* x_data = x->data<T>();
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
-    int batch_size = x->dims()[0];
     int class_num = x->dims()[1];
-
-    // TODO(qingqing): make zero setting an common function.
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = ctx.Input<Tensor>("Label")->data<T>();
-      int index = 0;
-      for (int i = 0; i < batch_size; ++i) {
-        for (int j = 0; j < class_num; ++j) {
-          dx_data[index] = -label_data[index] * dy_data[i] / x_data[index];
-          index++;
-        }
-      }
+    if (ctx.Attr<bool>("softLabel")) {
+      auto x_mat = EigenMatrix<T>::From(*x);
+      auto dy_mat = EigenMatrix<T>::From(*dy);
+      auto lbl_mat = EigenMatrix<T>::From(*label);
+      auto dx_mat = EigenMatrix<T>::From(*dx);
+
+      dx_mat.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+          -(lbl_mat * dy_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) /
+            x_mat);
     } else {
-      auto* label_data = label->data<int>();
+      int batch_size = x->dims()[0];
+      const T* dy_data = dy->data<T>();
+      const T* x_data = x->data<T>();
+      const int* label_data = label->data<int>();
+
+      // TODO(qingqing): make zero setting a common function.
       memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+
       for (int i = 0; i < batch_size; ++i) {
         PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
         int index = i * class_num + label_data[i];
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 52032f3a698005393b6766a941697267daa2bd55..1de514dff487158e0823fd628d9b3b50f36fdd9b 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"soft_label": False}
+        self.attrs = {"softLabel": False}
 
     def test_check_output(self):
         self.check_output()
@@ -49,7 +49,7 @@ class TestCrossEntropyOp2(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"soft_label": True}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
@@ -73,6 +73,7 @@ class TestCrossEntropyOp3(OpTest):
             0, class_num, (batch_size), dtype="int32")
         label = np.zeros(X.shape)
         label[np.arange(batch_size), label_index] = 1
+
        cross_entropy = np.asmatrix(
             [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])],
             dtype="float32")
@@ -81,7 +82,7 @@ class TestCrossEntropyOp3(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"soft_label": True}
+        self.attrs = {"softLabel": True}
 
     def test_check_output(self):
         self.check_output()
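As a quick reference for reviewers, the sketch below mirrors in NumPy the two loss formulas documented in the operator comment above: hard labels when softLabel is false, a per-row distribution when it is true. It is an illustrative sketch only; the function name cross_entropy_reference and the shapes chosen here are assumptions for the example and are not part of the patch or the operator's API.

# Hypothetical NumPy reference -- not taken from the patch above.
import numpy as np


def cross_entropy_reference(X, label, soft_label=False):
    """Return Y with shape [N x 1] given probabilities X with shape [N x D].

    soft_label=False: label is [N x 1] with class indices,
                      Y[i] = -log(X[i, label[i]]).
    soft_label=True:  label is [N x D] with a per-row distribution,
                      Y[i] = -sum_j label[i, j] * log(X[i, j]).
    """
    if soft_label:
        return (-label * np.log(X)).sum(axis=1, keepdims=True)
    # Hard labels: pick the predicted probability of the ground-truth class.
    rows = np.arange(X.shape[0])
    return -np.log(X[rows, label.flatten()]).reshape(-1, 1)


if __name__ == "__main__":
    batch_size, class_num = 4, 5
    X = np.random.uniform(0.1, 1.0, (batch_size, class_num)).astype("float32")
    X /= X.sum(axis=1, keepdims=True)  # make each row a probability vector

    hard = np.random.randint(0, class_num, (batch_size, 1))
    print(cross_entropy_reference(X, hard, soft_label=False))

    soft = np.random.uniform(0.1, 1.0, (batch_size, class_num))
    soft /= soft.sum(axis=1, keepdims=True)
    print(cross_entropy_reference(X, soft, soft_label=True))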