diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..55e3931f870d62dcaddc6c067f66999c59e2a262
--- /dev/null
+++ b/paddle/framework/data_type.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <typeindex>
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+// Maps a C++ element type to the matching DataType enum value (FP32, FP64 or
+// INT32); any other type is reported through PADDLE_THROW.
+// Types are compared with std::type_index::operator== rather than through
+// hash_code(), since distinct types are allowed to share a hash code.
+inline DataType ToDataType(std::type_index type) {
+  if (type == std::type_index(typeid(float))) return DataType::FP32;
+  if (type == std::type_index(typeid(double))) return DataType::FP64;
+  if (type == std::type_index(typeid(int))) return DataType::INT32;
+  PADDLE_THROW("Not supported");
+  // Fallback value in case PADDLE_THROW returns; presumably never reached.
+  return static_cast<DataType>(-1);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 90077d0192421f3678a049a723972fcb1e8d67af..4db38badaea8ae22d9ad47951f4941f3bdeb401a 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -100,13 +100,39 @@ class OpRegistrar : public Registrar {
   }
 };
 
-template <typename PlaceType, typename KernelType>
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor;
+
+// Recursive step: registers the I-th kernel type, then handles index I + 1.
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+  using KernelTuple = std::tuple<KernelTypes...>;
+  using KERNEL_TYPE = typename std::tuple_element<I, KernelTuple>::type;
+
+  void operator()(const char* op_type) const {
+    // The map key is the kernel's element data type plus the place type.
+    const std::type_index elem(typeid(typename KERNEL_TYPE::ELEMENT_TYPE));
+    OperatorWithKernel::OpKernelKey key(ToDataType(elem), PlaceType());
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
+
+    // Continue with the next index; at_end turns true past the last kernel.
+    constexpr bool done = (I + 1 == std::tuple_size<KernelTuple>::value);
+    OpKernelRegistrarFunctor<PlaceType, done, I + 1, KernelTypes...>()(op_type);
+  }
+};
+
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelTypes...> {
+  void operator()(const char*) const {}  // end of recursion: nothing to do
+};
+
+// Users can register several kernels in one place; their data types may differ.
+template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
   explicit OpKernelRegistrar(const char* op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = PlaceType();
-    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
+    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
+    func(op_type);
   }
 };
 
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 79bda2e2f9173ab632307bc52167d7d8c17d4418..ba697a43e9ebdd1837720098d74b95e2dbad77d3 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "op_info.h"
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/scope.h"
@@ -403,7 +404,7 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
   const Scope& scope_;
 };
 
-class OpKernel {
+class OpKernelBase {
  public:
   /**
    * ExecutionContext is the only parameter of Kernel Run function.
@@ -414,33 +415,47 @@ class OpKernel {
 
   virtual void Compute(const ExecutionContext& context) const = 0;
 
-  virtual ~OpKernel() {}
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
 };
 
 class OperatorWithKernel : public OperatorBase {
  public:
   struct OpKernelKey {
     platform::Place place_;
+    DataType data_type_;
 
-    OpKernelKey() = default;
-    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
-      place_ = dev_ctx.GetPlace();
-    }
+    OpKernelKey(DataType data_type, platform::Place place)
+        : place_(place), data_type_(data_type) {}
+
+    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
+        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
 
     bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_);
+      return platform::places_are_same_class(place_, o.place_) &&
+             data_type_ == o.data_type_;
     }
   };
 
   struct OpKernelHash {
-    std::hash<bool> hash_;
+    std::hash<int> hash_;
     size_t operator()(const OpKernelKey& key) const {
-      return hash_(platform::is_gpu_place(key.place_));
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_);
+      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
+                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
+      return hash_(pre_hash);
     }
   };
 
   using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
+                         OpKernelHash>;
 
   OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -451,8 +466,10 @@ class OperatorWithKernel : public OperatorBase {
     RuntimeInferShapeContext infer_shape_ctx(*this, scope);
     this->InferShape(&infer_shape_ctx);
 
-    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
+    ExecutionContext ctx(*this, scope, dev_ctx);
+    auto& opKernel = AllOpKernels().at(type_).at(
+        OpKernelKey(IndicateDataType(ctx), dev_ctx));
+    opKernel->Compute(ctx);
   }
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -462,13 +479,43 @@ class OperatorWithKernel : public OperatorBase {
   }
 
   bool SupportGPU() const override {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+    return std::any_of(op_kernels.begin(), op_kernels.end(),
+                       [](OpKernelMap::const_reference kern_pair) {
+                         return platform::is_gpu_place(kern_pair.first.place_);
+                       });
   }
 
  protected:
   virtual void InferShape(InferShapeContextBase* ctx) const = 0;
+
+  // Picks the kernel DataType from the input tensors found in the scope.
+  // By default, all input tensors must have the same element type.
+  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    auto& var_scope = ctx.scope();
+    int data_type = -1;  // -1: no tensor input has been seen yet
+    for (auto& slot : this->inputs_) {
+      for (auto& var_name : slot.second) {
+        auto* variable = var_scope.FindVar(var_name);
+        if (variable == nullptr) continue;
+        // Variables that are neither Tensor nor LoDTensor are ignored.
+        const Tensor* tensor = nullptr;
+        if (variable->IsType<Tensor>()) {
+          tensor = &variable->Get<Tensor>();
+        } else if (variable->IsType<LoDTensor>()) {
+          tensor = &variable->Get<LoDTensor>();
+        }
+        if (tensor == nullptr) continue;
+        // Each tensor must agree with the type recorded so far, if any.
+        const int tmp = static_cast<int>(ToDataType(tensor->type()));
+        PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                       "DataType of Paddle Op must be same.");
+        data_type = tmp;
+      }
+    }
+    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+    return static_cast<DataType>(data_type);
+  }
 };
 
 }  // namespace framework
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index e1d8f040b837a6ad598351dae0427cc7c231e79f..a0c17b41f27d9ec9a0f8e80576a052617919b000 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -114,10 +114,13 @@ class OpWithKernelTest : public OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContextBase* ctx) const override {}
+  DataType IndicateDataType(const ExecutionContext& ctx) const override {
+    return DataType::FP32;
+  }
 };
 
 template <typename T1, typename T2>
-class CPUKernelTest : public OpKernel {
+class CPUKernelTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
     std::cout << "this is cpu kernel" << std::endl;
@@ -144,7 +147,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
   }
 };
 
-class CPUKernalMultiInputsTest : public OpKernel {
+class CPUKernalMultiInputsTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
     auto xs = ctx.op().Inputs("xs");
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index f040c09c089ec75c9773d752685be5e232e8f4b7..80a3f0a3935ef6809ebd6f3bfb849d4e87d76d1b 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,20 +29,10 @@ limitations under the License. */
 
 namespace paddle {
 
-namespace pybind {
-namespace details {
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}
-}  // namespace pybind
-
 namespace framework {
 
 class Tensor {
  public:
-  template <bool less, size_t i, typename... args>
-  friend struct pybind::details::CastToPyBufferImpl;
-
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
 
@@ -119,6 +109,8 @@ class Tensor {
     return holder_->place();
   }
 
+  std::type_index type() const { return holder_->type(); }
+
  private:
   template <typename T>
   inline void check_memory_size() const;
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 75e8a989036f0b818687e1fec3e600bb90e86b22..0ca9ef941d4cb15619caea2b6baed197e4b15e5a 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -47,7 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
 }
 
 template <typename T>
-class AccuracyOpCUDAKernel : public framework::OpKernel {
+class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index fe704efe1c979f4fc6a5a37184e51b416f5e517f..12c6b9aac8819caedbc02017cee81b37322bb72a 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -35,7 +35,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class AccuracyKernel : public framework::OpKernel {
+class AccuracyKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* inference = ctx.Input<Tensor>("Inference");
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 15f8afb4ba45cc989fe7576b82b8bf853b1df7de..e400992ae29686d81a5ea32f9c50e05424246707 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T, typename Functor>
-class ActivationKernel : public framework::OpKernel {
+class ActivationKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -36,7 +36,7 @@ class ActivationKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename Functor>
-class ActivationGradKernel : public framework::OpKernel {
+class ActivationGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -202,7 +202,7 @@ struct SquareGradFunctor {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class BReluKernel : public framework::OpKernel {
+class BReluKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -219,7 +219,7 @@ class BReluKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class BReluGradKernel : public framework::OpKernel {
+class BReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -239,7 +239,7 @@ class BReluGradKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SoftReluKernel : public framework::OpKernel {
+class SoftReluKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -256,7 +256,7 @@ class SoftReluKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SoftReluGradKernel : public framework::OpKernel {
+class SoftReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -277,7 +277,7 @@ class SoftReluGradKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class PowKernel : public framework::OpKernel {
+class PowKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -293,7 +293,7 @@ class PowKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class PowGradKernel : public framework::OpKernel {
+class PowGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -312,7 +312,7 @@ class PowGradKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class STanhKernel : public framework::OpKernel {
+class STanhKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
@@ -329,7 +329,7 @@ class STanhKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class STanhGradKernel : public framework::OpKernel {
+class STanhGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
index a7307b6818aa3d10ff215d06281e2b53196fd101..75163032a1ff11a1f18cfd0a4ff7289ff0cb66bf 100644
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class AddKernel : public framework::OpKernel {
+class AddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* input0 = context.Input<Tensor>("X");
diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h
index ce1d4e1f460414e6e4acee4fa3207f309c55d86b..ac702e9935201ba5263a80ebeb1ab22fa0bd1340 100644
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -56,7 +56,7 @@ class ClipGradFunctor {
 };
 
 template <typename Place, typename T>
-class ClipKernel : public framework::OpKernel {
+class ClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
@@ -73,7 +73,7 @@ class ClipKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ClipGradKernel : public framework::OpKernel {
+class ClipGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
index b37063261123bce1f22c39ab021e88f2faf58e9f..c113f19fb5cf806709bff845ee0f1078b34014bb 100644
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ConcatKernel : public framework::OpKernel {
+class ConcatKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -44,7 +44,7 @@ class ConcatKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ConcatGradKernel : public framework::OpKernel {
+class ConcatGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index bcf6f758cae561a2e22f5be6c7a242647ef1c144..68c56f531f941e1b8f66ac7ba6bf318881642c4f 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class CosSimKernel : public framework::OpKernel {
+class CosSimKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     // get Tensor
@@ -67,7 +67,7 @@ class CosSimKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class CosSimGradKernel : public framework::OpKernel {
+class CosSimGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     // get Tensor
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index ac3aeaf41e206c1deb74c7022c36f02c4777a84b..2e72583d68d0acf0e2f5044637dba55de3b57209 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -27,7 +27,7 @@ using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
 template <typename T>
-class CropKernel : public framework::OpKernel {
+class CropKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -69,7 +69,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class CropGradKernel : public framework::OpKernel {
+class CropGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     size_t rank =
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 26fc9b51c44d21d92851030449e116538f937846..4b67887f3638f32a89d1a4fd1316c0596b444629 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -47,6 +47,12 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Y", {x_dims[0], 1});
     ctx->ShareLoD("X", /*->*/ "Y");
   }
+
+  // CrossEntropy's data type is determined solely by "X".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class CrossEntropyGradientOp : public framework::OperatorWithKernel {
@@ -87,6 +93,12 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
+
+  // CrossEntropy's data type is determined solely by "X".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 1cfeb7a53b047541322ac53c5b7249e660039d5c..76d63f77adccb0e7059b5dbe0bbfde1653dae6df 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -53,7 +53,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
 }  // namespace
 
 template <typename T>
-class CrossEntropyOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -69,7 +69,7 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 1f67461d3fadb1a979832ad049d4e0098256b834..fa81d3b4310a889dc0b21f6969ab39dddf053186 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-class CrossEntropyOpKernel : public framework::OpKernel {
+class CrossEntropyOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -42,7 +42,7 @@ class CrossEntropyOpKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class CrossEntropyGradientOpKernel : public framework::OpKernel {
+class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index a04e4a22cc09d4e8106a528e490ccf8e90681c08..30c769000f2b98c69eaa78a4c139630dd0956386 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -47,7 +47,7 @@ struct MaskGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename Place, typename T, typename AttrType>
-class GPUDropoutKernel : public framework::OpKernel {
+class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index d57f64afcb3558aeea6aed23fae06866e9af874a..745525fe81dadb22cbb64d66203f5a75608d3718 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T, typename AttrType>
-class CPUDropoutKernel : public framework::OpKernel {
+class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -62,7 +62,7 @@ class CPUDropoutKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class DropoutGradKernel : public framework::OpKernel {
+class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(context.Attr<bool>("is_training"),
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index e9f78ef26e05878053d968c35f17b456c128827a..f04fe3ec6069ab1bf227be6a3a5c10ee908e4824 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseAddKernel : public framework::OpKernel {
+class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
@@ -101,7 +101,7 @@ struct ElementwiseAddBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseAddGradKernel : public framework::OpKernel {
+class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index 99b6d9c1991edfb0018f8a459dfa373948cec434..8946ff3d25c2aff3dc3aa69368f0083371cd2fef 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseDivKernel : public framework::OpKernel {
+class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
@@ -103,7 +103,7 @@ struct ElementwiseDivBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseDivGradKernel : public framework::OpKernel {
+class ElementwiseDivGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index bda5dfe03e974740fe4a07191ae6b68ebfcd5d3a..da7765aa6a7a81c9e0b4f462022cad54c16aec47 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -36,7 +36,9 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
             elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index da08a75596c4d3b89dc8892bd4405464fec96389..056f081d3e6ac349978ff00689700c035bed8e39 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,9 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 6ab642378bb0af8593ca0677014aede3c03cff8e..4469b07eaa08a3b011a88e58f1d645dd30b10ced 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseMulBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseMulGradKernel : public framework::OpKernel {
+class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index 3ca1376c73b3332b76a5973e201f9e4fba77cd21..3f40c1c5bcea5e8473765b039de4ee2a16054f0c 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseSubKernel : public framework::OpKernel {
+class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseSubBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseSubGradKernel : public framework::OpKernel {
+class ElementwiseSubGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index 4474581784531faee1741f0b143743e31cc3788f..cdf56a723b117fe7b08ef2749aa2c2978c923d44 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class FillZerosLikeKernel : public framework::OpKernel {
+class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* output = context.Output<framework::Tensor>("Y");
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index 0e3cd174adee1e50d0a63861286a26d325484efb..da22bd0c52c27d7decd10e2e2b34fa38d0620da8 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -37,6 +37,11 @@ class GatherOp : public framework::OperatorWithKernel {
     output_dims[0] = batch_size;
     ctx->SetOutputDim("Out", output_dims);
   }
+  // Returns the framework::DataType matching the type() of input "X".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class GatherGradOp : public framework::OperatorWithKernel {
@@ -47,6 +52,11 @@ class GatherGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContextBase* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+  // Returns the framework::DataType matching the type() of input "X".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 381854f301870beadb72d9e9b4eb17ff199960fb..073e566e8f6962d62cc1b738672843421dcb4ee5 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -24,7 +24,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename Place, typename T>
-class GatherOpKernel : public framework::OpKernel {
+class GatherOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *X = ctx.Input<Tensor>("X");
@@ -37,7 +37,7 @@ class GatherOpKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class GatherGradientOpKernel : public framework::OpKernel {
+class GatherGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *Index = ctx.Input<Tensor>("Index");
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 05120a6e7bcfdb8641c722731f462c89e4223339..5cd2c7d2c066cd31e2d38a3c0d682f02339b4d59 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -16,7 +16,7 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-class CPUGaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     float mean = context.Attr<float>("mean");
@@ -56,6 +56,11 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
                    "dims can be one int or array. dims must be set.");
     ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
+  // Returns the int attribute "data_type" cast to framework::DataType.
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
 };
 
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -76,6 +81,8 @@ Use to initialize tensor with gaussian random generator.
                  "Random seed of generator."
                  "0 means use system wide seed")
         .SetDefault(0);
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
   }
 };
 
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 2d63b3049988cfc3135a87a57dad56b970df3eab..315560bf1ba8a66b9a3b7d79510d202885e845d6 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -37,7 +37,7 @@ struct GaussianGenerator {
 };
 
 template <typename T>
-class GPUGaussianRandomKernel : public framework::OpKernel {
+class GPUGaussianRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/gemm_conv2d_op.h
index 5c9e81732aa72211c2021382cf9a907880c53c17..323e3f7c3bd506c6b63bf4d1152384649f5da575 100644
--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/gemm_conv2d_op.h
@@ -25,7 +25,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel {
+class GemmConv2DKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -98,7 +98,7 @@ class GemmConv2DKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel {
+class GemmConvGrad2DKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 9b1314bfbade8551d98b0fbabb7c2968d7600db5..929008fbcbe03bd6591b0a02252b343c46d00b8f 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -36,6 +36,11 @@ class LookupTableOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
     ctx->ShareLoD("Ids", /*->*/ "Out");
   }
+  // Returns the framework::DataType matching the type() of input "W".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -69,6 +74,11 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
     auto table_dims = ctx->GetInputDim("W");
     ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
   }
+  // Returns the framework::DataType matching the type() of input "W".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 62f63b4f3c876e084e2468001e8bcb9310d16a82..c3808fa9a8de031fcae3ac0417e8c4330b2f5aad 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -61,7 +61,7 @@ __global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
 }
 
 template <typename T>
-class LookupTableCUDAKernel : public framework::OpKernel {
+class LookupTableCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto table_t = context.Input<Tensor>("W");
@@ -85,7 +85,7 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class LookupTableGradCUDAKernel : public framework::OpKernel {
+class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ids_t = context.Input<Tensor>("Ids");
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index a1298906dd4b4209644fe06584f70169519de01c..dfead2fc5b25b9be26bb19cd74a3a94daf62cca6 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename T>
-class LookupTableKernel : public framework::OpKernel {
+class LookupTableKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto table_t = context.Input<Tensor>("W");      // float tensor
@@ -44,7 +44,7 @@ class LookupTableKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class LookupTableGradKernel : public framework::OpKernel {
+class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ids_t = context.Input<Tensor>("Ids");
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index 6e5e4978994c281416a65af5f8ffdec688768d63..b1db0d53227148de53b04587b943945f8563346e 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -90,7 +90,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
 }
 
 template <typename T, typename AttrType = T>
-class LstmUnitOpCUDAKernel : public framework::OpKernel {
+class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -121,7 +121,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T, typename AttrType = T>
-class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 683034fe15df8cabfdff5e856adb5c0467055064..0dc9a7d9a7aae2e16bc4488731f572f43778baf8 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -33,7 +33,7 @@ inline T tanh(T x) {
 }
 
 template <typename Place, typename T, typename AttrType = T>
-class LstmUnitKernel : public framework::OpKernel {
+class LstmUnitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -76,7 +76,7 @@ class LstmUnitKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class LstmUnitGradKernel : public framework::OpKernel {
+class LstmUnitGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index ce31e178d8e375dc59be80a6c05133201308da70..c99286a5b928f1edcd845b01b21b95654c25db07 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class MeanKernel : public framework::OpKernel {
+class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* input = context.Input<Tensor>("X");
@@ -45,7 +45,7 @@ class MeanKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MeanGradKernel : public framework::OpKernel {
+class MeanGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
index 6310a4fd5141516cff4fc7acbe1d17913a1b5506..bd9a2790aa2b208c2d3dfc792031283eb6c42397 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class MinusKernel : public framework::OpKernel {
+class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* left_tensor = context.Input<framework::Tensor>("X");
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
index bce760f95e72cfec05b07591e0fa1250168b112f..8854e166cd99ce914d7f9f9bcead3234b0649506 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -39,7 +39,7 @@ struct ModifiedHuberLossBackward {
 };
 
 template <typename T>
-class ModifiedHuberLossGradGPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("Y");
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index cb51007749e3c59572d4852959f4119ac377decc..aba75efad9c19e3e113b4f09bc1fbd4732f4e187 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -47,7 +47,7 @@ struct ModifiedHuberLossForward {
 };
 
 template <typename Place, typename T>
-class ModifiedHuberLossKernel : public framework::OpKernel {
+class ModifiedHuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -73,7 +73,7 @@ class ModifiedHuberLossKernel : public framework::OpKernel {
 
 // CPU backward kernel
 template <typename T>
-class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("Y");
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index ac7136a76933d1f3ead86518c65d589747227631..684b1ea0c0c8ddabc9809cc05ed985e0cc250955 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class MulKernel : public framework::OpKernel {
+class MulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* x = context.Input<Tensor>("X");
@@ -52,7 +52,7 @@ class MulKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MulGradKernel : public framework::OpKernel {
+class MulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 9896d269ccc86d8fdc3bf6375e44ef5bf3e6b9c7..a069127a19a1d0ba4eaa2b3450a1c46262ace3ed 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -50,6 +50,11 @@ class MultiplexOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", in_dim);
   }
+  // Returns the DataType matching type() of X[0]; "X" is indexed unchecked.
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -99,6 +104,11 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
   }
+  // Returns the DataType matching type() of X[0]; "X" is indexed unchecked.
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 505776612e7119e568493506b113661a839e5bd1..72b1f96eafde37976b4b067b534112b17e02b807 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename Place, typename T>
-class MultiplexGPUKernel : public framework::OpKernel {
+class MultiplexGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<Tensor>("X");
@@ -51,7 +51,7 @@ class MultiplexGPUKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel {
+class MultiplexGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index 637c63a34af394f5f54997c46c00a9ff00577476..ab3cafaa324a29d6f249cf1f73db92e1364eebc8 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -23,7 +23,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class MultiplexCPUKernel : public framework::OpKernel {
+class MultiplexCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -48,7 +48,7 @@ class MultiplexCPUKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+class MultiplexGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h
index 2cc3b945ae5b2e2e93d8531c7f99e4c215d1d806..9534dbf54529e3b9ae2b6640d51fe291e9521927 100644
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
@@ -47,7 +47,7 @@ void PadFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class PadKernel : public framework::OpKernel {
+class PadKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
@@ -97,7 +97,7 @@ void PadGradFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class PadGradKernel : public framework::OpKernel {
+class PadGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     size_t rank =
diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h
index 6b78ed295cbac060d816fb3dd27a4b80145cb1ce..5ad31c2203ae6c9bf6f48bb9ecf9a714597e7da8 100644
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
@@ -40,7 +40,7 @@ class PReluFunctor {
 };
 
 template <typename Place, typename T>
-class PReluKernel : public framework::OpKernel {
+class PReluKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -77,7 +77,7 @@ class PReluGradFunctor {
 };
 
 template <typename Place, typename T>
-class PReluGradKernel : public framework::OpKernel {
+class PReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index 7df195ff47ecfd79388385eed4bd37b8c9b45979..f184d6efcb496a1d7f38540712b6c431f816482e 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class RankLossKernel : public framework::OpKernel {
+class RankLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out_t = ctx.Output<framework::Tensor>("Out");
@@ -42,7 +42,7 @@ class RankLossKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class RankLossGradKernel : public framework::OpKernel {
+class RankLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_left_t =
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index 2fbf94e34f3961a9b3140fb682a7c479f3b71f4d..ba3f3db81dc6251a063d27e597fd7e486e7b6c14 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -87,7 +87,7 @@ struct MaxOrMinGradFunctor {
 };
 
 template <typename Place, typename T, typename Functor>
-class ReduceKernel : public framework::OpKernel {
+class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
@@ -141,7 +141,7 @@ class ReduceKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename Functor>
-class ReduceGradKernel : public framework::OpKernel {
+class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 873acf30782d390cdca5e7e864c76e1f743f9a7c..628dfe4c0fadcfeec188d8ae5049a994e3281bc1 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ReshapeKernel : public framework::OpKernel {
+class ReshapeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::Tensor>("Out");
@@ -39,7 +39,7 @@ class ReshapeKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ReshapeGradKernel : public framework::OpKernel {
+class ReshapeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index 35774b940926f77167b8f19597027e74d3477e5b..b43e5d868b38350a74ca1a94880990da6d7da0bc 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class RowwiseAddKernel : public framework::OpKernel {
+class RowwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto out = context.Output<Tensor>("Out");
@@ -50,7 +50,7 @@ class RowwiseAddKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class RowwiseAddGradKernel : public framework::OpKernel {
+class RowwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
index 02fbdc52bbf89c9f2acc5eeaa1197e4ccbca9d31..dc6bc768997f4fdd049bb63bdc11252ab52fcda9 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 template <typename Place, typename T, typename AttrType = T>
-class ScaleKernel : public framework::OpKernel {
+class ScaleKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 3fc4a39ebc5526bfed61ba667c3cdc214cdd056c..cadd8841b6ab3a3674054240265eb6d4b474db1e 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -48,6 +48,11 @@ class ScatterOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", ref_dims);
   }
+  // Returns the framework::DataType matching the type() of input "Ref".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 
 class ScatterGradOp : public framework::OperatorWithKernel {
@@ -60,6 +65,11 @@ class ScatterGradOp : public framework::OperatorWithKernel {
                       ctx->GetInputDim("Updates"));
     ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
   }
+  // Returns the framework::DataType matching the type() of input "Ref".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index e9595638a86a4a4536ddad4e6f20fd80a54b1608..a8eb54399a932913de208e1ddc90a6ff0dfaa452 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -24,7 +24,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename Place, typename T>
-class ScatterOpKernel : public framework::OpKernel {
+class ScatterOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *Ref = ctx.Input<Tensor>("Ref");
@@ -40,7 +40,7 @@ class ScatterOpKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ScatterGradientOpKernel : public framework::OpKernel {
+class ScatterGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index cb80586e88f8d9e31b7b91a54f5e05ac6fa73f0f..752d714125578b2d1f926765b183495ec5cc203e 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -38,7 +38,7 @@ enum SeqPoolType {
 };
 
 template <typename Place, typename T>
-class SequencePoolKernel : public framework::OpKernel {
+class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
@@ -85,7 +85,7 @@ class SequencePoolKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SequencePoolGradKernel : public framework::OpKernel {
+class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index f8888f9c362e1c39af42236bb3a23be37aa3ae15..a3fe3308942f98e2c28376b589b6fc930e6878a1 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SGDOpKernel : public framework::OpKernel {
+class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto param = ctx.Input<Tensor>("param");
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
index a6de9043fdbcdcca47407aac0b4892cbad3a9a42..41c619f181c878f08959a8ca461c60af5ffdff2a 100644
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -21,7 +21,7 @@ namespace operators {
 
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename Place, typename T>
-class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel {
+class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     const framework::Tensor *X = context.Input<framework::Tensor>("X");
@@ -48,7 +48,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel {
 
 // dX = sigmoid(X) - labels
 template <typename Place, typename T>
-class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel {
+class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
     const framework::Tensor *X = context.Input<framework::Tensor>("X");
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h
index 0604fb5e1c2f17c702208520a1d23bd5c3c65b5d..39d0070b6c8909b8f433de48038240e851d9d6cf 100644
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -45,7 +45,7 @@ struct SmoothL1LossForward {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossKernel : public framework::OpKernel {
+class SmoothL1LossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -115,7 +115,7 @@ struct SmoothL1LossBackward {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossGradKernel : public framework::OpKernel {
+class SmoothL1LossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("InsideWeight");
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 7220f486be055e1b841a06b15f519717c54f575c..9996536454b1b6b992385787301faa6d66a4cd20 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto X = context.Input<Tensor>("X");
@@ -40,7 +40,7 @@ class SoftmaxKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SoftmaxGradKernel : public framework::OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto Y = context.Input<Tensor>("Y");
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index e2299b254458cdd42dee4683561d4d5c81653fb1..a76489871f30dc8d852b6a783efeff41704fd4a4 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
+#include "paddle/framework/data_type.h"  // framework::ToDataType
 
 namespace paddle {
 namespace operators {
@@ -115,6 +116,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("Logits", /*->*/ "Softmax");
     ctx->ShareLoD("Logits", /*->*/ "Loss");
   }
+  // Returns the framework::DataType matching the type() of input "Logits".
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Logits")->type());
+  }
 };
 
 class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
@@ -149,6 +155,12 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
     ctx->SetOutputDim(framework::GradVarName("Logits"),
                       ctx->GetInputDim("Softmax"));
   }
+
+  framework::DataType IndicateDataType(  // type of input GradVarName("Loss")
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 1cf4296dccf68aece6fdfb7910a9c68449633b76..c3086e729e493228e06a176e1a64e5e95fad148b 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -53,7 +53,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
 }  // namespace
 
 template <typename T>
-class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
@@ -73,7 +73,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index bf792c1f59e2e43a98c93bddbc2aa63d646dee6f..a8b18504e1c3a1d617b6040d2c68f24f1cb2787d 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
@@ -47,7 +47,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* out_grad =
diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h
index 8ab8e0ee4fea621b34da73507c53846100d61a17..fa26e5f677b18c84b45dd583004d02cab4c1d375 100644
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class SplitOpKernel : public framework::OpKernel {
+class SplitOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h
index 097ac04fc09a10b3b624f491a847e281e41a802c..259ef4029646914f83a112b9c6d7fdf8401483f6 100644
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SquaredL2DistanceKernel : public framework::OpKernel {
+class SquaredL2DistanceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -68,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SquaredL2DistanceGradKernel : public framework::OpKernel {
+class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("sub_result");
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 0b1e9ebaa38d455fb5e3ce8c1a39cbbcdad9a940..7e8fbb9e41c694df9169ea583ce47c33d3bcf2bb 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -22,7 +22,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SumKernel : public framework::OpKernel {
+class SumKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ins = context.MultiInput<Tensor>("X");
@@ -43,7 +43,7 @@ class SumKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel {
+class SumGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index 53fe505b77bfac8a33803f082f8e935d3ed403b6..7be6932f1e301d06e0e232367a38bfa673ff45be 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -279,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int* indices,
 }
 
 template <typename T>
-class TopkOpCUDAKernel : public framework::OpKernel {
+class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
index ef66acc1d569282a42be64b7a5e90f3fbdb20690..4b248faa120bcfd20e70d288cce2d485d3e6371e 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class TopkKernel : public framework::OpKernel {
+class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index ea299dce72ad340b0a65ee50582dc156b5ad7abb..aaa3f47ab5545accd4d1108e0ad6f5a3062186d0 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -38,7 +38,7 @@ void EigenTranspose(const framework::ExecutionContext& context,
 }
 
 template <typename Place, typename T>
-class TransposeKernel : public framework::OpKernel {
+class TransposeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<framework::Tensor>("X");
@@ -73,7 +73,7 @@ class TransposeKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class TransposeGradKernel : public framework::OpKernel {
+class TransposeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* out_grad =
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 2771df56086ff261728af84edcdf01cda3e45e9f..97b1d0bed4595cb750e4d2122f294f10edfbe0ff 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -21,7 +21,7 @@ namespace operators {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class CPUUniformRandomKernel : public framework::OpKernel {
+class CPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* tensor = ctx.Output<framework::Tensor>("Out");
@@ -62,6 +62,11 @@ class UniformRandomOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
+
+  framework::DataType IndicateDataType(  // taken from attr "data_type"
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(Attr<int>("data_type"));
+  }
 };
 
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -80,6 +85,8 @@ Used to initialize tensor with uniform random generator.
                  "Random seed of uniform random. "
                  "0 means generate a seed by system")
         .SetDefault(0);
+    AddAttr<int>("data_type", "output tensor data type")
+        .SetDefault(framework::DataType::FP32);
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index 6614b53b3f990d10c82633f3c1f079acea0cd827..5612ce9eb1c644d6271b4a9bb949f685848e05c0 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -40,7 +40,7 @@ struct UniformGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class GPUUniformRandomKernel : public framework::OpKernel {
+class GPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index b31515e1f028acac885a506ff1c20479407a05e3..856e54df89c1c18ade040957188a2fbda0901473 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -47,7 +47,7 @@ bool is_cpu_place(const Place &p) {
 }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
-  return is_gpu_place(p1) == is_gpu_place(p2);
+  return p1.which() == p2.which();  // same boost::variant alternative index
 }
 
 std::ostream &operator<<(std::ostream &os, const Place &p) {
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 1117476bb37f1b0f3876c55e610803d5ee2558ce..0efc6932349a5b3ad295d195a16737a642e18943 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <iostream>
+
 #include "paddle/platform/variant.h"
 
 namespace paddle {
@@ -46,8 +47,18 @@ struct IsGPUPlace : public boost::static_visitor<bool> {
   bool operator()(const GPUPlace &gpu) const { return true; }
 };
 
+// Defines the maximum number of Place types as a bit length, i.e., the number
+// of place types must be at most 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT).
+#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
+
 typedef boost::variant<GPUPlace, CPUPlace> Place;
 
+// Statically check that the number of place types is at most
+// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT).
+BOOST_MPL_ASSERT((boost::mpl::less_equal<
+                  Place::types::size,
+                  boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
+
 void set_place(const Place &);
 const Place &get_place();
 
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index c2257af1b5dd1a1e284979bf17e1a947072baa85..16ee00efe7a9b0406f8459e19a55e1e1b9ca7419 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -29,4 +29,6 @@
 #endif
 #endif
 
+#include <boost/mpl/comparison.hpp>
+#include <boost/mpl/less_equal.hpp>
 #include <boost/variant.hpp>
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index f0d5a6f9ff963ecd80d0c261daff56bff50663d4..10621e90eebf5cf197893a548c32d8b67af8e0b6 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -42,7 +42,7 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   py::buffer_info operator()(framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
       std::vector<size_t> strides;
@@ -56,13 +56,13 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         prod *= dims_outside[i - 1];
       }
       framework::Tensor dst_tensor;
-      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+      if (paddle::platform::is_gpu_place(tensor.place())) {
         dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
-      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+      } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
       return py::buffer_info(
-          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()),
           sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
           (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
     } else {