diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 41d044cdb72b5fb2a7f8654e8ad103778e0857d1..260c8064ac3c996940a0216b21fb473e976af8e2 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -53,6 +53,5 @@ The equation is: Out = X + Y
 }  // namespace paddle
 
 REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
-typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
-    AddKernel_CPU_float;
-REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
+REGISTER_OP_CPU_KERNEL(
+    add_two, paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
index 0edf142ee4e5f359ea14be02dbf3f7f8855f6db1..2e5a755f92e4d1fa487152ed453fe3b2823062ed 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -1,6 +1,5 @@
 #include "paddle/operators/add_op.h"
 #include "paddle/framework/op_registry.h"
 
-typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
 REGISTER_OP_GPU_KERNEL(add_two,
-                       AddKernel_GPU_float);
\ No newline at end of file
+                       paddle::operators::AddKernel<::paddle::platform::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 713b2a5dc83d8dd5a3d944101591d75cb19fe04f..7aa63961a0f49aad7dc88360f1d466b401ca83a2 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -57,4 +57,4 @@ The equation is: Out = X * Y
 
 REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index 201723df247993c5cc1650edbe4f74441e3217d4..75f00e746c22968815a450dfc1fc8982ebe677c0 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -17,4 +17,4 @@
 #include "paddle/framework/op_registry.h"
 
 REGISTER_OP_GPU_KERNEL(mul,
-                       paddle::operators::MulKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
+                       paddle::operators::MulKernel<paddle::platform
+                                                        ::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index ce8a0169e0cbaafb7e90d2227c9597fff463883d..13e5b6a95016a4c3c9be575861b13f131c61d983 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -20,11 +20,22 @@ namespace paddle {
 namespace operators {
 
-template <typename Place>
+template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Mul kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
+    dim_pair[0].first = 1;
+    dim_pair[0].second = 0;
+
+    auto input0 = context.Input(0)->Get<framework::Tensor>();
+    auto input1 = context.Input(1)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    output->mutable_data<T>(context.GetPlace());
+
+    output->matrix<T>().device(*(context.GetEigenDevice<Place>())) =
+        input0.matrix<T>().contract(input1.matrix<T>(), dim_pair);
   }
 };
 }  // namespace operators
 
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index 414bafd0468033813d50d4d6723e68ee9347eaac..567b058fd07a74014cac728e7d10f6f2f272ef38 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -58,4 +58,4 @@ REGISTER_OP(rowwise_add,
             paddle::operators::RowWiseAddOpMaker);
 REGISTER_OP_CPU_KERNEL(
     rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 2c4bfbf93a1064a47a19c991fa6655b5d67e83cb..58fe96a4a35314909b99a6c2f0442420c6243fc2 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -3,4 +3,4 @@
 
 REGISTER_OP_GPU_KERNEL(
     rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace>);
+    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 35f43e6376be6239021e7a9bacb849b93d5226b5..f1d43002dc56ff36a7145795f721422d97d4fdde 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -19,11 +19,24 @@ namespace paddle {
 namespace operators {
 
-template <typename Place>
+template <typename Place, typename T>
 class RowWiseAddKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto in0 = context.Input(0)->Get<framework::Tensor>();
+    auto in1 = context.Input(1)->Get<framework::Tensor>();
+    auto* out = context.Output(0)->GetMutable<framework::Tensor>();
+
+    auto input = in0.matrix<T>();
+    auto bias = in1.vec<T>();
+    auto output = out->matrix<T>();
+
+    const int bias_size = bias.dimension(0);
+    const int rest_size = input.size() / bias_size;
+    Eigen::DSizes<int, 1> one_d(input.size());
+    Eigen::DSizes<int, 1> bcast(rest_size);
+    output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
+        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
   }
 };
 
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
index 45ae277c538ca90716febaf2f3d92b560149d147..fa13f2c4f74c583aff56f907178c77b4acf5d37e 100644
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -46,4 +46,5 @@ REGISTER_OP(sigmoid,
             paddle::operators::SigmoidOp,
             paddle::operators::SigmoidOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
+    sigmoid,
+    paddle::operators::SigmoidKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
index 79d5222348f610b1b016a2df06e8b1e0a4fac66c..59bba2729f7efb31f633dc8448e8b8db51ede49a 100644
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -2,4 +2,4 @@
 #include <paddle/framework/op_registry.h>
 
 REGISTER_OP_GPU_KERNEL(
-    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
index 42173343f3e364729ecd190fc554b8c45ecfca8d..7995b75297c7b040f5a000e2d39918ce3b5bd4a0 100644
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -20,11 +20,17 @@ namespace paddle {
 namespace operators {
 
-template <typename Place>
+template <typename Place, typename T>
 class SigmoidKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto input = context.Input(0)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    output->mutable_data<T>(context.GetPlace());
+
+    output->flat<T>().device(*(context.GetEigenDevice<Place>())) =
+        1.0 / (1.0 + (-1.0 * input.flat<T>()).exp());
   }
 };
 }  // namespace operators
 
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 4ca7be359e210d7a31aef94e498f37a1ad4879a2..42795adbdc83658558d116ba87ffc6bf34a6fc8e 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -23,6 +23,8 @@ protected:
       const std::vector<const framework::Tensor *> &inputs,
       const std::vector<framework::Tensor *> &outputs) const override {
     PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
+                   "The input of softmax op must be matrix");
     PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
 
     outputs[0]->set_dims(inputs[0]->dims());
@@ -46,4 +48,5 @@ public:
 
 namespace ops = paddle::operators;
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index 903eef1b62231d65e2f9ec7a1f57fca0f4c4605c..730c76a04b58636069431c193bd03584411f2b4a 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -2,4 +2,4 @@
 #include <paddle/framework/op_registry.h>
 
 REGISTER_OP_GPU_KERNEL(
-    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 74e9e2786b11b9a87cd9700d8458d4e611a8d4bb..34a6c299bbff4da63ff179ae093c0c6866436d06 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -20,11 +20,39 @@ namespace paddle {
 namespace operators {
 
-template <typename Place>
+template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto input = context.Input(0)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    auto logits = input.matrix<T>();
+    auto softmax = output->matrix<T>();
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits - logits.maximum(along_class)
+                                        .eval()
+                                        .reshape(batch_by_one)
+                                        .broadcast(one_by_class));
+
+    softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
+
+    softmax.device(*(context.GetEigenDevice<Place>())) =
+        (softmax * softmax.sum(along_class)
+                       .inverse()
+                       .eval()
+                       .reshape(batch_by_one)
+                       .broadcast(one_by_class));
   }
 };
 }  // namespace operators