Merge branch 'develop' into memory_load

test=develop

Merge branch 'develop' into memory_load
test=develop
923b1887 · Tao Luo · 405b2486 · b52f5d28 · 923b1887 · 923b1887
101 changed file
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -166,6 +166,8 @@ function(op_library TARGET)
      # Append first implemented MKLDNN activation operator
      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
      else()
        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
      endif()

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))

--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
+add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
+cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@@ -191,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)

-cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)

 cc_test(tuple_test SRCS tuple_test.cc )

--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 void AllReduceOpHandle::RunImpl() {
  platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);

+// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
+// this is a distributed or inter-process call, find a better way.
+#ifdef PADDLE_WITH_CUDA
+  if (NoDummyInputSize() == 1 &&
+      local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
+#else
  if (NoDummyInputSize() == 1) {
+#endif
    return;  // No need to all reduce when GPU count = 1;
  } else {
    // Wait input done

--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
    auto multi_devices_pass = AppendPass("multi_devices_pass");
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                         &strategy_);
+    multi_devices_pass->Set<int>("num_trainers",
+                                 new int(strategy_.num_trainers_));

    // Add a graph print pass to record a graph with device info.
    if (!strategy_.debug_graphviz_path_.empty()) {

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
 static const char kParams[] = "params";
 static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";
+static const char kNumTrainers[] = "num_trainers";

 void MultiDevSSAGraphBuilder::Init() const {
  all_vars_.clear();
@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  auto nodes = graph->ReleaseNodes();
  ir::Graph &result = *graph;

+  int num_trainers = Get<int>(kNumTrainers);
+
  for (auto &node : nodes) {
    if (node->IsVar() && node->Var()) {
      all_vars_.emplace(node->Name(), node->Var());
@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
          CreateComputationalOps(&result, node, places_.size());
        }

-        if (!is_forwarding && places_.size() > 1) {
+        if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
          // Currently, we assume that once gradient is generated, it can be
          // broadcast, and each gradient is only broadcast once.
          if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
    .RequirePassAttr(paddle::framework::details::kPlaces)
    .RequirePassAttr(paddle::framework::details::kParams)
    .RequirePassAttr(paddle::framework::details::kLocalScopes)
-    .RequirePassAttr(paddle::framework::details::kStrategy);
+    .RequirePassAttr(paddle::framework::details::kStrategy)
+    .RequirePassAttr(paddle::framework::details::kNumTrainers);
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -16,7 +16,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "glog/logging.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace framework {
@@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
  return tensor;
 }

+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) {
+  Variable* var = scope.FindVar(var_name);
+  PADDLE_ENFORCE(var, "%s no in scope", var_name);
+  PADDLE_ENFORCE(var->IsType<LoDTensor>(), "Only support lod tensor now.");
+  return *var->GetMutable<LoDTensor>();
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
 LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                            size_t index);

+LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name);
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) {
    switch (role_id) {
      case _INT(OpRole::kForward):
        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
-          LOG(ERROR)
-              << "Cannot add backward operator before forward operator %s."
-              << op->Type();
+          LOG(ERROR) << "Cannot add backward operator before forward operator "
+                     << op->Type();
        }
        break;
      case _INT(OpRole::kBackward):

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -177,14 +177,13 @@ class Graph {
    return nullptr;
  }

-  const ProgramDesc &program() const { return program_; }
-  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
-      const ProgramDesc &program);
-
  void ResolveHazard(
      const std::map<std::string, std::vector<ir::Node *>> &var_nodes);

 private:
+  std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
+      const ProgramDesc &program);
+
  // This method takes ownership of `node`.
  ir::Node *AddNode(ir::Node *node) {
    PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());

--- a/paddle/fluid/framework/op_kernel_type.cc
+++ b/paddle/fluid/framework/op_kernel_type.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_kernel_type.h"
+
+namespace paddle {
+namespace framework {
+
+size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
+  int cur_loc = 0;
+
+  int place = key.place_.which();
+  cur_loc += OpKernelType::kPlaceBits;
+
+  int data_type = static_cast<int>(key.data_type_) << cur_loc;
+  cur_loc += OpKernelType::kPrimaryDTypeBits;
+
+  int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
+  cur_loc += OpKernelType::kLayoutBits;
+
+  int library_type = static_cast<int>(key.library_type_) << cur_loc;
+  cur_loc += OpKernelType::kLibBits;
+
+  int customized_value = key.customized_type_value_;
+  PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
+  customized_value = customized_value << cur_loc;
+  cur_loc += OpKernelType::kCustomizeBits;
+  PADDLE_ENFORCE(cur_loc < 64);
+
+  std::hash<int> hasher;
+  return hasher(place + data_type + data_layout + library_type +
+                customized_value);
+}
+
+bool OpKernelType::operator==(const OpKernelType& o) const {
+  return platform::places_are_same_class(place_, o.place_) &&
+         data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
+         library_type_ == o.library_type_ &&
+         customized_type_value_ == o.customized_type_value_;
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -24,54 +24,55 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-struct OpKernelType {
-  struct Hash {
-    size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which();
-      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
-      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
-      int library_type = static_cast<int>(key.library_type_)
-                         << (LEFT_SHIFT * 3);
-
-      std::hash<int> hasher;
-      return hasher(place + data_type + data_layout + library_type);
-    }
-  };
+class OpKernelType {
+ public:
+  constexpr static int kDefaultCustomizedTypeValue = 0;

-  // place, data_type, library_type kinds less than 2^8
-  constexpr static int LEFT_SHIFT = 8;
-
-  proto::VarType::Type data_type_;
-  DataLayout data_layout_;
-  platform::Place place_;
-  LibraryType library_type_;
+  // In total should be smaller than 64.
+  constexpr static int kPlaceBits = 4;
+  constexpr static int kPrimaryDTypeBits = 8;
+  constexpr static int kLayoutBits = 4;
+  constexpr static int kLibBits = 4;
+  constexpr static int kCustomizeBits = 4;

  OpKernelType(proto::VarType::Type data_type, platform::Place place,
               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
      : data_type_(data_type),
        data_layout_(data_layout),
        place_(place),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}

  OpKernelType(proto::VarType::Type data_type,
               const platform::DeviceContext& dev_ctx,
               DataLayout data_layout = DataLayout::kAnyLayout,
-               LibraryType library_type = LibraryType::kPlain)
+               LibraryType library_type = LibraryType::kPlain,
+               int customized_type_value = kDefaultCustomizedTypeValue)
      : data_type_(data_type),
        data_layout_(data_layout),
        place_(dev_ctx.GetPlace()),
-        library_type_(library_type) {}
+        library_type_(library_type),
+        customized_type_value_(customized_type_value) {}
+
+  virtual ~OpKernelType() {}
+
+  struct Hash {
+    size_t operator()(const OpKernelType& key) const;
+  };

  size_t hash_key() const { return Hash()(*this); }

-  bool operator==(const OpKernelType& o) const {
-    return platform::places_are_same_class(place_, o.place_) &&
-           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
-           library_type_ == o.library_type_;
-  }
+  bool operator==(const OpKernelType& o) const;

  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
+
+  proto::VarType::Type data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+  int customized_type_value_;
 };

 inline std::ostream& operator<<(std::ostream& os,

--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -35,6 +35,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+
 class Registrar {
 public:
  // In our design, various kinds of classes, e.g., operators and kernels,
@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor;

 template <typename PlaceType, typename T, typename Func>
 inline void RegisterKernelClass(const char* op_type, const char* library_type,
-                                Func func) {
+                                int customized_type_value, Func func) {
  std::string library(library_type);
  std::string data_layout = "ANYLAYOUT";
  if (library == "MKLDNN") {
@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type,
  }
  OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
                   StringToDataLayout(data_layout),
-                   StringToLibraryType(library_type));
+                   StringToLibraryType(library_type), customized_type_value);
  OperatorWithKernel::AllOpKernels()[op_type][key] = func;
 }

@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
  using KERNEL_TYPE =
      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;

-  void operator()(const char* op_type, const char* library_type) const {
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {
    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
    RegisterKernelClass<PlaceType, T>(
-        op_type, library_type, [](const framework::ExecutionContext& ctx) {
+        op_type, library_type, customized_type_value,
+
+        [](const framework::ExecutionContext& ctx) {
          KERNEL_TYPE().Compute(ctx);
        });
    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
        func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
  }
 };

 template <typename PlaceType, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
-  void operator()(const char* op_type, const char* library_type) const {}
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {}
 };

 // User can register many kernel in one place. The data type could be
@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
 public:
-  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type,
+                             int customized_type_value) {
    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
  }
 };

@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx;
 template <typename PlaceType, typename... DataTypeAndKernelType>
 class OpKernelRegistrarEx : public Registrar {
 public:
-  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) {
+  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
+                               int customized_type_value) {
    OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
        func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
  }
 };

 template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
 struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
                                  DataTypeAndKernelType...> {
-  void operator()(const char* op_type, const char* library_type) const {}
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {}
 };

 template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
      typename std::tuple_element<I,
                                  std::tuple<DataTypeAndKernelType...>>::type;

-  void operator()(const char* op_type, const char* library_type) const {
-    RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor());
+  void operator()(const char* op_type, const char* library_type,
+                  int customized_type_value) const {
+    RegisterKernelClass<PlaceType, T>(op_type, library_type,
+                                      customized_type_value, Functor());

    constexpr auto size =
        std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
    OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
                               DataTypeAndKernelType...>
        func;
-    func(op_type, library_type);
+    func(op_type, library_type, customized_type_value);
  }
 };

+// clang-format off
 /**
 * check if MACRO is used in GLOBAL NAMESPACE.
 */
@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 /**
 * Macro to register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##library_type##__,                      \
-      "REGISTER_OP_KERNEL must be called in global namespace");            \
-  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
-                                                           #library_type); \
-  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
-    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
-    return 0;                                                              \
+#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type,             \
+                                            place_class, customized_name,      \
+                                            customized_type_value, ...)        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
+      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__,      \
+                                 "REGISTER_OP_KERNEL must be called in "       \
+                                 "global namespace");                          \
+  static ::paddle::framework::OpKernelRegistrar<place_class,                   \
+                                                __VA_ARGS__>                   \
+      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
+          #op_type, #library_type, customized_type_value);                     \
+  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
+    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
+        .Touch();                                                              \
+    return 0;                                                                  \
  }

+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)   \
+  REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(                                \
+      op_type, library_type, place_class, DEFAULT_TYPE,               \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
+
 #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)

 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

-#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...)      \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
-      __reg_op_kernel_##op_type##_##library_type##__,                       \
-      "REGISTER_OP_KERNEL_EX must be called in global namespace");          \
-  static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \
-      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,        \
-                                                           #library_type);  \
-  int TouchOpKernelRegistrar_##op_type##_##library_type() {                 \
-    __op_kernel_registrar_##op_type##_##library_type##__.Touch();           \
-    return 0;                                                               \
+#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
+                              customized_name,                     \
+                              customized_type_value,               \
+                              ...)                                 \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                  \
+      __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
+                                 "REGISTER_OP_KERNEL_EX must be called in "  \
+                                 "global namespace");  \
+  static ::paddle::framework::OpKernelRegistrarEx<place_class,  \
+                                                  __VA_ARGS__>  \
+      __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
+          #op_type, #library_type, customized_type_value);  \
+  int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
+    __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__   \
+        .Touch();                                                              \
+    return 0;                                                                  \
  }

 #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)                 \
-  REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \
-                        __VA_ARGS__)
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE,     \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)

-#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
-  REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...)                  \
+  REGISTER_OP_KERNEL_EX(                                              \
+      op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE,       \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)

 /**
 * Macro to mark what Operator and Kernel
@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
  extern int TouchOpRegistrar_##op_type();                 \
  UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()

-#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)               \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
-      __use_op_kernel_##op_type##_##LIBRARY_TYPE##__,             \
-      "USE_OP_DEVICE_KERNEL must be in global namespace");        \
-  extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
-  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \
-      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                     \
+                                              LIBRARY_TYPE,                \
+                                              customized_name)             \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+      __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__,  \
+      "USE_OP_DEVICE_KERNEL must be in global namespace");                 \
+  extern int                                                               \
+      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
+  UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
+      TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
+
+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
+  USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)

 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define USE_OP(op_type)   \
  USE_OP_ITSELF(op_type); \
  USE_OP_KERNEL(op_type)
+// clang-format off

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
    AddAttr<float>("scale", "scale of cosine op");
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
    AddComment("This is test op");
  }
 };
@@ -95,6 +97,8 @@ TEST(OperatorBase, all) {
 namespace paddle {
 namespace framework {

+static int special_type_value = 1;
+
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  void Make() {
@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .GreaterThan(0.0);
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
    AddComment("This is test op");
  }
 };

 static int cpu_kernel_run_num = 0;
+static int cpu_kernel2_run_num = 0;

 class OpWithKernelTest : public OperatorWithKernel {
 public:
@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {}
  OpKernelType GetExpectedKernelType(
      const ExecutionContext& ctx) const override {
-    return OpKernelType(proto::VarType::FP32, ctx.GetPlace());
+    int sub_type = ctx.Attr<int>("kernel_sub_type");
+    return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
+                        framework::DataLayout::kAnyLayout,
+                        framework::LibraryType::kPlain, sub_type);
  }
 };

@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> {
  }
 };

+template <typename T1, typename T2>
+class CPUKernel2Test : public OpKernel<float> {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    std::cout << ctx.op().DebugString() << std::endl;
+    cpu_kernel2_run_num++;
+    ASSERT_EQ(ctx.op().Input("x"), "IN1");
+    ASSERT_EQ(ctx.op().Output("y"), "OUT1");
+  }
+};
+
 class OpKernelTestMultiInputsProtoAndCheckerMaker
    : public OpProtoAndCheckerMaker {
 public:
@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .GreaterThan(0.0);
+    AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
+        .SetDefault(0);
    AddComment("This is test op");
  }
 };
@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> {
 REGISTER_OP_WITHOUT_GRADIENT(
    op_with_kernel, paddle::framework::OpWithKernelTest,
    paddle::framework::OpKernelTestProtoAndCheckerMaker);
+
 REGISTER_OP_CPU_KERNEL(op_with_kernel,
                       paddle::framework::CPUKernelTest<float, float>);

+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
+    op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
+    paddle::framework::special_type_value,
+    paddle::framework::CPUKernel2Test<float, float>);
+
 // test with single input
 TEST(OpKernel, all) {
  paddle::framework::InitDevices(true);
@@ -211,7 +240,19 @@ TEST(OpKernel, all) {
  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
  op->Run(scope, cpu_place);
+  // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called.
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
+
+  attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("kernel_sub_type");
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(1);
+  auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
+  op2->Run(scope, cpu_place);
+  // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called.
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+  ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
 }

 REGISTER_OP_WITHOUT_GRADIENT(

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
+cc_library(layer SRCS layer.cc DEPS proto_desc operator)
+cc_library(tracer SRCS tracer.cc DEPS proto_desc)
+cc_library(engine SRCS engine.cc)
--- a/paddle/fluid/imperative/engine.cc
+++ b/paddle/fluid/imperative/engine.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/engine.h"
+
+#include <mutex>  // NOLINT
+#include <vector>
+
+#include "glog/logging.h"
+
+namespace paddle {
+namespace imperative {
+
+static std::once_flag init_engine;
+static Engine* engine;
+
+class DummyEngine : public Engine {
+ public:
+  void Enqueue(Runnable* runnable) override {
+    queued_runnables_.push_back(runnable);
+  }
+
+  size_t Size() const override { return queued_runnables_.size(); }
+
+  void Sync() override {
+    for (Runnable* l : queued_runnables_) {
+      LOG(INFO) << "running " << reinterpret_cast<void*>(l);
+    }
+    queued_runnables_.clear();
+  }
+
+ private:
+  std::vector<Runnable*> queued_runnables_;
+};
+
+Engine* GetEngine() {
+  std::call_once(init_engine, []() { engine = new DummyEngine(); });
+  return engine;
+}
+
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/engine.h
+++ b/paddle/fluid/imperative/engine.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace paddle {
+namespace imperative {
+
+struct Runnable {};
+
+class Engine {
+ public:
+  virtual ~Engine() {}
+
+  virtual void Enqueue(Runnable* runnable) = 0;
+
+  virtual size_t Size() const = 0;
+
+  virtual void Sync() = 0;
+};
+
+Engine* GetEngine();
+
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/layer.h"
+#include <deque>
+#include <limits>
+#include <map>
+#include <random>
+#include <utility>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace imperative {
+
+using framework::Variable;
+
+void AddTo(Variable* src, Variable* dst) {
+  framework::LoDTensor* dst_tensor = dst->GetMutable<framework::LoDTensor>();
+  framework::LoDTensor* src_tensor = src->GetMutable<framework::LoDTensor>();
+  PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld",
+                 dst_tensor->numel(), src_tensor->numel());
+  float* dst_data = dst_tensor->mutable_data<float>(platform::CPUPlace());
+  const float* src_data = src_tensor->data<float>();
+  for (size_t i = 0; i < src_tensor->numel(); ++i) {
+    dst_data[i] += src_data[i];
+  }
+}
+
+class Autograd {
+ public:
+  explicit Autograd(framework::Scope* scope) : scope_(scope) {}
+
+  void RunBackward(VarBase* var) {
+    PADDLE_ENFORCE(var->pre_op_->op_desc_);
+    // TODO(panyx0718): Only create for vars that "require_grad"
+    (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_;
+
+    std::deque<OpBase*> ready;
+    ready.push_back(var->pre_op_);
+
+    std::map<OpBase*, int> dep_counts = ComputeDepCounts(var->pre_op_);
+
+    while (!ready.empty()) {
+      OpBase* ready_op = ready.front();
+      ready.pop_front();
+      std::vector<Variable*> input_grads = ready_op->ApplyGrad(scope_);
+
+      for (size_t i = 0; i < input_grads.size(); ++i) {
+        if (!input_grads[i]) continue;
+        OpBase* pre_op = ready_op->pre_ops_->at(i);
+        if (!pre_op) continue;
+
+        dep_counts[pre_op] -= 1;
+        PADDLE_ENFORCE(dep_counts[pre_op] >= 0);
+        bool pre_op_ready = dep_counts[pre_op] == 0;
+        if (pre_op_ready) {
+          ready.push_back(pre_op);
+        }
+      }
+    }
+  }
+
+ private:
+  std::map<OpBase*, int> ComputeDepCounts(OpBase* op) {
+    std::map<OpBase*, int> ret;
+
+    std::deque<OpBase*> queue;
+    queue.push_back(op);
+    std::unordered_set<OpBase*> visited;
+    visited.insert(op);
+    while (!queue.empty()) {
+      OpBase* candidate = queue.front();
+      queue.pop_front();
+      for (OpBase* pre_op : *(candidate->pre_ops_)) {
+        if (!pre_op) continue;
+        if (visited.find(pre_op) == visited.end()) {
+          visited.insert(pre_op);
+          queue.push_back(pre_op);
+        }
+        ret[pre_op] += 1;
+      }
+    }
+
+    return ret;
+  }
+
+  framework::Scope* scope_;
+};
+
+framework::Variable* CreateVariable(const std::string& name,
+                                    const framework::DDim& dim, float val,
+                                    framework::Scope* scope,
+                                    bool random_name = true) {
+  std::string varname = name;
+  if (random_name) {
+    std::mt19937 rng;
+    rng.seed(std::random_device()());
+    std::uniform_int_distribution<std::mt19937::result_type> dist6(
+        1, std::numeric_limits<int>::max());
+    int id = dist6(rng);
+    varname = string::Sprintf("%s@%d", varname, id);
+  }
+
+  VLOG(3) << "creating var " << varname;
+  framework::Variable* var = scope->Var(varname);
+  framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+
+  float* data = tensor->mutable_data<float>(dim, platform::CPUPlace());
+  std::fill(data, data + tensor->numel(), val);
+  return var;
+}
+
+framework::LoDTensor& VarBase::Grad() {
+  VLOG(3) << "get var grad " << var_desc_->Name();
+  return *grads_->GetMutable<framework::LoDTensor>();
+}
+
+void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) {
+  VLOG(3) << "apply var grad " << var_desc_->Name() << " "
+          << grad->Get<framework::LoDTensor>().data<float>()[0];
+  if (!grads_) {
+    grads_ =
+        CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()),
+                       var_->Get<framework::LoDTensor>().dims(), 0.0, scope);
+  }
+  AddTo(grad, grads_);
+  VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " "
+          << grads_->Get<framework::LoDTensor>().data<float>()[0];
+}
+
+std::vector<Variable*> OpBase::ApplyGrad(framework::Scope* scope) {
+  VLOG(3) << "op grad " << grad_op_desc_->Type();
+
+  for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) {
+    if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) {
+      // grad op inputs can be forward inputs, so not in grad_to_var.
+      continue;
+    }
+    VLOG(3) << "op grad in var " << grad_invar;
+    block_->FindRecursiveOrCreateVar(grad_invar);
+    framework::Variable* var = scope->Var(grad_invar);
+    const std::string& invar = grad_to_var_->at(grad_invar);
+    for (VarBase* varbase : *output_vars_) {
+      // Use the accumulated grads_ by sharing the input with grads_.
+      if (varbase->var_desc_->Name() == invar) {
+        var->GetMutable<framework::LoDTensor>()->ShareDataWith(
+            varbase->grads_->Get<framework::LoDTensor>());
+        break;
+      }
+    }
+  }
+
+  for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+    VLOG(3) << "grad outvar " << outvar;
+    block_->FindRecursiveOrCreateVar(outvar);
+    framework::Variable* var = scope->Var(outvar);
+    if (!var->IsInitialized()) {
+      framework::VarDesc* var_desc = block_->FindVar(outvar);
+      if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+        var->GetMutable<framework::LoDTensor>();
+      } else {
+        LOG(ERROR) << "tracer doesn't support yet";
+      }
+    }
+  }
+  grad_op_desc_->InferShape(*block_);
+  grad_op_desc_->InferVarType(block_);
+  std::unique_ptr<framework::OperatorBase> opbase =
+      framework::OpRegistry::CreateOp(*grad_op_desc_);
+
+  opbase->Run(*scope, platform::CPUPlace());
+
+  // `ret` matches exactly with `input_vars_` of forward op.
+  std::vector<Variable*> ret;
+  for (size_t i = 0; i < input_vars_->size(); ++i) {
+    bool found = false;
+    for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) {
+      Variable* var = scope->FindVar(outvar);
+      VarBase* origin_var = (*input_vars_)[i];
+      std::string orig_var = grad_to_var_->at(outvar);
+      PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var);
+      VLOG(3) << "apply grad " << outvar << " with origin " << orig_var;
+      origin_var->ApplyGrad(scope, var);
+      found = true;
+      ret.push_back(var);
+      // TODO(panyx0718): There might be another outvar with the same name.
+      // In that case, it doesn't matter the first one or the second one is
+      // used.
+      break;
+    }
+    if (!found) {
+      ret.push_back(nullptr);
+    }
+  }
+  return ret;
+}
+
+void VarBase::RunBackward(framework::Scope* scope) {
+  grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()),
+                          var_->Get<framework::LoDTensor>().dims(), 1.0, scope,
+                          false);
+  if (!pre_op_) return;
+  Autograd(scope).RunBackward(this);
+}
+
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace imperative {
+
+class OpBase;
+
+class VarBase {
+ public:
+  VarBase()
+      : pre_op_(nullptr),
+        pre_op_out_idx_(-1),
+        var_desc_(nullptr),
+        var_(nullptr),
+        grads_(nullptr) {}
+
+  virtual ~VarBase() {}
+
+  void ApplyGrad(framework::Scope* scope, framework::Variable* grad);
+
+  void RunBackward(framework::Scope* scope);
+
+  framework::LoDTensor& Grad();
+
+  OpBase* pre_op_;
+  int pre_op_out_idx_;
+
+  framework::VarDesc* var_desc_;
+  framework::Variable* var_;
+  framework::Variable* grads_;
+};
+
+class OpBase {
+ public:
+  OpBase()
+      : input_vars_(new std::vector<VarBase*>()),
+        output_vars_(new std::vector<VarBase*>()),
+        pre_ops_(new std::vector<OpBase*>()),
+        pre_ops_out_idx_(new std::vector<int>()),
+        op_desc_(nullptr),
+        grad_op_desc_(nullptr) {}
+
+  virtual ~OpBase() {
+    delete input_vars_;
+    delete output_vars_;
+
+    delete pre_ops_;
+    delete pre_ops_out_idx_;
+
+    if (grad_op_desc_) delete grad_op_desc_;
+    if (grad_to_var_) delete grad_to_var_;
+  }
+
+  std::vector<framework::Variable*> ApplyGrad(framework::Scope* scope);
+
+  std::vector<VarBase*>* input_vars_;
+  std::vector<VarBase*>* output_vars_;
+  std::vector<OpBase*>* pre_ops_;
+  std::vector<int>* pre_ops_out_idx_;
+  framework::OpDesc* op_desc_;
+
+  framework::OpDesc* grad_op_desc_;
+  std::unordered_map<std::string, std::string>* grad_to_var_;
+  framework::BlockDesc* block_;
+};
+
+class Layer {
+ public:
+  virtual ~Layer() {}
+
+  virtual std::vector<VarBase> Forward(const std::vector<VarBase>& inputs) {
+    std::vector<VarBase> vars;
+    return vars;
+  }
+
+  virtual void Backward() { LOG(ERROR) << "To support customize"; }
+};
+
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace imperative {}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/engine.h"
+#include "paddle/fluid/imperative/layer.h"
+
+namespace paddle {
+namespace imperative {
+
+void CreateGradOp(const framework::OpDesc& op_desc,
+                  const std::unordered_set<std::string>& no_grad_set,
+                  const std::vector<framework::BlockDesc*>& grad_sub_block,
+                  framework::OpDesc** grad_op_desc,
+                  std::unordered_map<std::string, std::string>* grad_to_var) {
+  std::vector<std::unique_ptr<framework::OpDesc>> grad_op_descs =
+      framework::OpInfoMap::Instance()
+          .Get(op_desc.Type())
+          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now.");
+  // TODO(panyx0718): Leak?
+  *grad_op_desc = grad_op_descs[0].release();
+}
+
+class Tracer {
+ public:
+  explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+    root_scope_ = new framework::Scope();
+    scopes_[root_block_] = root_scope_;
+  }
+
+  virtual ~Tracer() { delete root_scope_; }
+
+  void Trace(OpBase* op, const std::vector<VarBase*>& inputs,
+             const std::vector<VarBase*>& outputs,
+             framework::BlockDesc* block) {
+    framework::Scope* scope = GetScope(block);
+    framework::OpDesc* op_desc = op->op_desc_;
+    VLOG(3) << "tracer tracing " << op_desc->Type();
+    op_desc->InferShape(*block);
+    op_desc->InferVarType(block);
+    std::unique_ptr<framework::OperatorBase> op_base =
+        framework::OpRegistry::CreateOp(*op_desc);
+
+    *op->input_vars_ = inputs;
+    for (VarBase* input : inputs) {
+      const std::string vname = input->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      input->var_ = var;
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      if (input->pre_op_) {
+        op->pre_ops_->push_back(input->pre_op_);
+        op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_);
+      } else {
+        op->pre_ops_->push_back(nullptr);
+      }
+    }
+
+    *op->output_vars_ = outputs;
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      const std::string vname = outputs[i]->var_desc_->Name();
+      framework::Variable* var = scope->Var(vname);
+      if (!var->IsInitialized()) {
+        framework::VarDesc* var_desc = block->FindVar(vname);
+        if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) {
+          var->GetMutable<framework::LoDTensor>();
+        } else {
+          LOG(ERROR) << "tracer doesn't support yet";
+        }
+      }
+      outputs[i]->var_ = var;
+      outputs[i]->pre_op_ = op;
+      outputs[i]->pre_op_out_idx_ = i;
+    }
+    op_base->Run(*scope, platform::CPUPlace());
+    framework::OpDesc* grad_op_desc;
+    auto grad_to_var = new std::unordered_map<std::string, std::string>();
+    CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var);
+    op->grad_op_desc_ = grad_op_desc;
+    op->grad_to_var_ = grad_to_var;
+    op->block_ = block;
+  }
+
+  framework::Scope* GetScope(framework::BlockDesc* block) {
+    if (scopes_.find(block) != scopes_.end()) {
+      return scopes_.at(block);
+    }
+    framework::BlockDesc* parent_block = block->ParentBlock();
+    PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end());
+    framework::Scope* scope = &scopes_[parent_block]->NewScope();
+    scopes_[block] = scope;
+    return scope;
+  }
+
+ private:
+  std::map<framework::BlockDesc*, framework::Scope*> scopes_;
+  framework::BlockDesc* root_block_;
+  framework::Scope* root_scope_;
+};
+
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    output_mapping.push_back(output_name_map[name]);
  }

-  *block_desc.Proto()->mutable_vars() =
-      const_cast<framework::ProgramDesc *>(&graph->program())
-          ->Proto()
-          ->blocks(0)
-          .vars();
+  auto *vars = block_desc.Proto()->mutable_vars();
+  for (framework::ir::Node *node : graph->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      *vars->Add() = *node->Var()->Proto();
+    }
+  }
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
  PADDLE_ENFORCE(!output_mapping.empty());

--- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc
@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) {
 }  // namespace inference
 }  // namespace paddle

-// USE_OP(prelu);
-USE_CPU_ONLY_OP(prelu);
+USE_OP(prelu);
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
 nv_library(tensorrt_plugin
           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
           avg_pool_op_plugin.cu
-           DEPS enforce tensorrt_engine)
+           DEPS enforce tensorrt_engine prelu)
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -14,92 +14,16 @@

 #include <stdio.h>
 #include <cassert>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+#include "paddle/fluid/operators/math/prelu.h"

 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {

-static const int CUDA_NUM_THREADS = 1024;
-static const int CUDA_MAX_NUM_BLOCKS = 65535;
-inline static int GET_NUM_BLOCKS(const int N) {
-  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-}
-
-__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
-                                       float *output, int channel,
-                                       size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  float *out = output + offset;
-  float scale = alpha[blockIdx.x % channel];
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
-                                       float *output, size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  const float *scale = alpha + offset;
-  float *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale[i] * x;
-  }
-}
-
-__global__ void PReluScalarKernel(const float *input, const float *alpha,
-                                  float *output, size_t spatial_size) {
-  size_t offset = blockIdx.x * spatial_size;
-  const float *in = input + offset;
-  float scale = *alpha;
-  float *out = output + offset;
-
-  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
-    float x = in[i];
-    out[i] = (x > 0) ? x : scale * x;
-  }
-}
-
-static inline void PReluChannelWise(cudaStream_t stream, const float *input,
-                                    const float *alpha, float *output,
-                                    int batch_size,
-                                    const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, dims.d[0], spatial_size);
-}
-
-static inline void PReluElementWise(cudaStream_t stream, const float *input,
-                                    const float *alpha, float *output,
-                                    int batch_size,
-                                    const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
-static inline void PReluScalar(cudaStream_t stream, const float *input,
-                               const float *alpha, float *output,
-                               int batch_size, const nvinfer1::Dims &dims) {
-  size_t unroll = batch_size * dims.d[0];
-  size_t spatial_size = dims.d[1] * dims.d[2];
-  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
-  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
-      input, alpha, output, spatial_size);
-}
-
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims *inputDims,
                                                int nbInputs) {
@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
  return output_dims;
 }

-int PReluPlugin::enqueue(int batchSize, const void *const *inputs,
+int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
                         void **outputs, void *workspace, cudaStream_t stream) {
  // input dims is CHW.
  const auto &input_dims = this->getInputDims(0);
  const float *input = reinterpret_cast<const float *>(inputs[0]);
  const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
  float *output = reinterpret_cast<float **>(outputs)[0];
+
+  std::vector<int> input_shape;
+  input_shape.push_back(batch_size);
+  for (int i = 0; i < input_dims.nbDims; i++) {
+    input_shape.push_back(input_dims.d[i]);
+  }
+
  if (mode_ == "channel") {
-    PReluChannelWise(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluChannelWiseDirectCUDAFunctor<float>
+        prelu_channel_wise;
+    prelu_channel_wise(stream, input, alpha, output, input_shape);
  } else if (mode_ == "element") {
-    PReluElementWise(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluElementWiseDirectCUDAFunctor<float>
+        prelu_element_wise;
+    prelu_element_wise(stream, input, alpha, output, input_shape);
  } else {
-    PReluScalar(stream, input, alpha, output, batchSize, input_dims);
+    operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
+    prelu_scalar(stream, input, alpha, output, input_shape);
  }
  return cudaGetLastError() != cudaSuccess;
 }

--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -14,11 +14,13 @@

 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include <string>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/string/split.h"

 DEFINE_bool(init_allocated_mem, false,
            "It is a mistake that the values of the memory allocated by "
@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
  static std::once_flag init_flag;
  static detail::BuddyAllocator **a_arr = nullptr;
+  static std::vector<int> devices;

  std::call_once(init_flag, [gpu_id]() {
-    int gpu_num = platform::GetCUDADeviceCount();
-    PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id,
-                   gpu_num);
+    devices = platform::GetSelectedDevices();
+    int gpu_num = devices.size();

    a_arr = new BuddyAllocator *[gpu_num];
-    for (int i = 0; i < gpu_num; i++) {
+    for (size_t i = 0; i < devices.size(); ++i) {
+      int dev_id = devices[i];
      a_arr[i] = nullptr;
-      platform::SetDeviceId(i);
-      a_arr[i] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+      platform::SetDeviceId(dev_id);
+      a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                        new detail::GPUAllocator(dev_id)),
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());

      VLOG(10) << "\n\nNOTE: each GPU device use "
               << FLAGS_fraction_of_gpu_memory_to_use * 100
@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
  });

  platform::SetDeviceId(gpu_id);
-  return a_arr[gpu_id];
+  auto pos = std::distance(devices.begin(),
+                           std::find(devices.begin(), devices.end(), gpu_id));
+  return a_arr[pos];
 }
 #endif


--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -70,7 +70,7 @@ endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
 endif()

 # FIXME(typhoonzero): operator deps may not needed.

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
  }
 #endif
  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
-      ctx.GetPlace(), layout, library);
+      framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
+      library);
 }

 class ActivationOp : public framework::OperatorWithKernel {

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = {
    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
 };

+/* The following operator can be used to process SelectedRows, because the
+ * output of those operator for zero is zero too.
+ */
+static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
+    "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
+
 static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }

 template <typename DeviceContext, typename Functor>
@@ -50,16 +56,38 @@ class ActivationKernel
  using T = typename Functor::ELEMENT_TYPE;

  void Compute(const framework::ExecutionContext& context) const override {
-    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
-                          "Cannot get input tensor X, variable name = %s",
-                          context.op().Input("X"));
-
-    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
-                            "Cannot get output tensor Out, variable name = %s",
-                            context.op().Output("Out"));
-    Out.mutable_data<T>(context.GetPlace());
+    auto x_var = context.InputVar("X");
+    auto out_var = context.OutputVar("Out");
+    PADDLE_ENFORCE(x_var != nullptr,
+                   "Cannot get input Variable X, variable name = %s",
+                   context.op().Input("X"));
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot get output Variable Out, variable name = %s",
+                   context.op().Output("Out"));
+
+    framework::Tensor X, *Out;
+
+    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+      X = detail::Ref(
+          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
+          "Cannot get input Tensor X, variable name = %s",
+          context.op().Input("X"));
+      Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          out_var);
+    } else {
+      X = detail::Ref(context.Input<framework::Tensor>("X"),
+                      "Cannot get input Tensor X, variable name = %s",
+                      context.op().Input("X"));
+      Out = context.Output<framework::Tensor>("Out");
+    }
+
+    PADDLE_ENFORCE(Out != nullptr,
+                   "Cannot get output tensor Out, variable name = %s",
+                   context.op().Output("Out"));
+
+    Out->mutable_data<T>(context.GetPlace());
    auto x = framework::EigenVector<T>::Flatten(X);
-    auto out = framework::EigenVector<T>::Flatten(Out);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
@@ -78,14 +106,54 @@ class ActivationGradKernel
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& context) const override {
-    auto* Out = context.Input<framework::Tensor>("Out");
-    auto* dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_var = context.InputVar("Out");
+    auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
+    auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot get input Variable Out, variable name = %s",
+                   context.op().Input("Out"));
+    PADDLE_ENFORCE(out_grad_var != nullptr,
+                   "Cannot get input Variable %s, variable name = %s",
+                   framework::GradVarName("Out"),
+                   context.op().Input(framework::GradVarName("Out")));
+    PADDLE_ENFORCE(x_grad_var != nullptr,
+                   "Cannot get output Variable %s, variable name = %s",
+                   framework::GradVarName("X"),
+                   context.op().Output(framework::GradVarName("X")));
+
+    framework::Tensor Out, dOut, *dX;
+    if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+      Out = detail::Ref(
+          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
+          "Cannot get input Tensor Out, variable name = %s",
+          context.op().Input("Out"));
+      dOut =
+          detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
+                          *out_grad_var),
+                      "Cannot get input Tensor %s, variable name = %s",
+                      framework::GradVarName("Out"),
+                      context.op().Input(framework::GradVarName("Out")));
+      dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
+          x_grad_var);
+    } else {
+      Out = detail::Ref(context.Input<framework::Tensor>("Out"),
+                        "Cannot get input Tensor Out, variable name = %s",
+                        context.op().Input("Out"));
+      dOut = detail::Ref(
+          context.Input<framework::Tensor>(framework::GradVarName("Out")),
+          "Cannot get input Tensor %s, variable name = %s",
+          framework::GradVarName("Out"),
+          context.op().Input(framework::GradVarName("Out")));
+      dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    }
+    PADDLE_ENFORCE(dX != nullptr,
+                   "Cannot get output tensor %s, variable name = %s",
+                   framework::GradVarName("X"),
+                   context.op().Output(framework::GradVarName("X")));
    dX->mutable_data<T>(context.GetPlace());

-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto dout = framework::EigenVector<T>::Flatten(dOut);
+    auto out = framework::EigenVector<T>::Flatten(Out);
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
@@ -96,8 +164,19 @@ class ActivationGradKernel
    }
    bool inplace = functor.Inplace();
    if (!inplace) {
-      auto* X = context.Input<framework::Tensor>("X");
-      auto x = framework::EigenVector<T>::Flatten(*X);
+      auto x_var = context.InputVar("X");
+      PADDLE_ENFORCE(x_var != nullptr,
+                     "Cannot get input tensor X, variable name = %s",
+                     context.op().Input("X"));
+      framework::Tensor X;
+      if (CanBeUsedBySelectedRows.count(context.op().Type())) {
+        X = detail::Ref(
+            paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
+      } else {
+        X = detail::Ref(context.Input<framework::Tensor>("X"));
+      }
+
+      auto x = framework::EigenVector<T>::Flatten(X);
      functor(*place, x, out, dout, dx);
    } else {
      VLOG(10) << " Inplace activation ";

--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
  if (bias) {
-    math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y);
-    math::vec_relu<T, platform::jit::avx>(n, y, y);
+    math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
+    math::vec_relu<T, platform::avx>(n, y, y);
  } else {
-    math::vec_relu<T, platform::jit::avx>(n, x, y);
+    math::vec_relu<T, platform::avx>(n, x, y);
  }
 }

@@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) {
  for (int i = 1; i < n; ++i) {
    scalar = scalar < x[i] ? x[i] : scalar;
  }
-  math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y);  // sub
-  math::vec_exp<T>(n, y, y);                                    // exp
+  math::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
+  math::vec_exp<T>(n, y, y);                               // exp
  // sum
  scalar = T(0);
  for (int i = 0; i < n; ++i) {
@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
+    if (platform::MayIUse(platform::avx)) {
+      math::VecActivations<T, platform::avx> act_functor;
      act_gate = act_functor(act_gate_str);
      act_cell = act_functor(act_cell_str);
      act_cand = act_functor(act_cand_str);
    } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      math::VecActivations<T, platform::isa_any> act_functor;
      act_gate = act_functor(act_gate_str);
      act_cell = act_functor(act_cell_str);
      act_cand = act_functor(act_cand_str);

--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {

    auto x_dims = framework::vectorize(input->dims());
    auto f_dims = framework::vectorize(filter->dims());
-    if (activation == "identity") {
-      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
-      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
-      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-    } else if (!exhaustive_search) {
+    if (!exhaustive_search) {
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                      "workspace_size to be allocated exceeds the limit");

-    // ------------------- cudnn conv+bias+act forward --------------------
-    ScalingParamType<T> alpha1 = 1.0f;
-    ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
-    auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
-          handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
-          filter_data, cudnn_conv_desc, algo, cudnn_workspace,
-          workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
-          cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+    if ((activation == "identity") &&
+        (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
+        (!residual)) {
+      // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
+      // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
+      // But test in some case, the speed is slower, change to use
+      // cudnnConvolutionForward and cudnnAddTensor
+      // ------------- cudnn conv forward and bias add ---------------------
+      ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+      CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
+          handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
          output_data));
-    };
-    workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    } else {
+      if (activation == "identity") {
+        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+      }
+      // ------------------- cudnn conv+bias+act forward --------------------
+      ScalingParamType<T> alpha1 = 1.0f;
+      ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
+      auto cudnn_func = [&](void* cudnn_workspace) {
+        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
+            handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
+            filter_data, cudnn_conv_desc, algo, cudnn_workspace,
+            workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
+            cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
+            output_data));
+      };
+      workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
+    }
  }
 };
 #endif

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -491,8 +491,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {

 namespace ops = paddle::operators;

-REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvMKLDNNOpKernel<float>);
-
-REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConvMKLDNNGradOpKernel<float>);
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
+                                    ::paddle::platform::CPUPlace, FP32,
+                                    ops::kConvMKLDNNFP32,
+                                    ops::ConvMKLDNNGradOpKernel<float>);
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {

 framework::OpKernelType ConvOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
+  int customized_type_value =
+      framework::OpKernelType::kDefaultCustomizedTypeValue;
  framework::LibraryType library{framework::LibraryType::kPlain};
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  std::string data_format = ctx.Attr<std::string>("data_format");
@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
      platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
+    customized_type_value = kConvMKLDNNFP32;
  }
 #endif

@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
  }

  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
-                                 library);
+                                 library, customized_type_value);
 }

 void Conv2DOpMaker::Make() {
@@ -342,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {

 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
+  int customized_type_value =
+      framework::OpKernelType::kDefaultCustomizedTypeValue;
  framework::LibraryType library_{framework::LibraryType::kPlain};
  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
  std::string data_format = ctx.Attr<std::string>("data_format");
@@ -357,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
      platform::CanMKLDNNBeUsed(ctx)) {
    library_ = framework::LibraryType::kMKLDNN;
    layout_ = framework::DataLayout::kMKLDNN;
+    customized_type_value = kConvMKLDNNFP32;
  }
 #endif

  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+      layout_, library_, customized_type_value);
 }

 }  // namespace operators

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -27,6 +27,8 @@ namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
+constexpr int kConvMKLDNNFP32 = 1;
+constexpr int kConvMKLDNNINT8 = 2;

 // Base convolution operator definations for other conv
 // like operators to reuse the implementation.

--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -177,11 +177,19 @@ struct CudnnRNNCache {
        seed_));

    CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
+
+#if CUDNN_VERSION >= 6000
    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
        handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
        CUDNN_LINEAR_INPUT,
        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
        CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
+#else
+    CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
+        rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_DATA_FLOAT));
+#endif

    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
    CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));

--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto x_var = ctx.InputVar("X");
+    PADDLE_ENFORCE(x_var != nullptr,
+                   "Cannot get input Variable X, variable name = %s",
+                   ctx.op().Input("X"));
    auto* y = ctx.Input<framework::LoDTensor>("Y");
-    auto* z = ctx.Output<framework::LoDTensor>("Out");
+
+    framework::Tensor x, *z;
+    if (x_var->IsType<framework::SelectedRows>()) {
+      PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1,
+                     "For elementwise_op, if X is Sparse, Y must be scalar.");
+      auto& x_sele = x_var->Get<framework::SelectedRows>();
+      auto out_sele = ctx.Output<framework::SelectedRows>("Out");
+      x = x_sele.value();
+      out_sele->set_rows(x_sele.rows());
+      out_sele->set_height(x_sele.height());
+      out_sele->mutable_value()->Resize(x_sele.value().dims());
+      out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
+      z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
+    } else if (x_var->IsType<framework::LoDTensor>()) {
+      x = x_var->Get<framework::LoDTensor>();
+      z = ctx.Output<framework::LoDTensor>("Out");
+    } else {
+      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                   x_var->Type().name());
+    }

    z->mutable_data<T>(ctx.GetPlace());
-    if (x->numel() == y->numel()) {
-      elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+    if (x.numel() == y->numel()) {
+      elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
    } else {
-      default_elementwise_mul<DeviceContext, T>(ctx, x, y, z);
+      default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
    }
  }
 };

--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of elementwise op should not be null.");

-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("X").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
    PADDLE_ENFORCE(
        ctx->GetInputsVarType("Y").front() ==
            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s",
-        ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front());
-
-    auto x_dim = ctx->GetInputDim("X");
-    auto y_dim = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.");
+        "The input var's type should be LoDTensor, but the received is %s [%s]",
+        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
+
+    if (ctx->GetInputsVarType("X").front() ==
+        framework::proto::VarType::LOD_TENSOR) {
+      auto x_dim = ctx->GetInputDim("X");
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                        "Rank of first input must >= rank of second input.");
+    } else if (ctx->GetInputsVarType("X").front() ==
+               framework::proto::VarType::SELECTED_ROWS) {
+      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
+                         (ctx->GetInputDim("Y")[0] == 1),
+                     "For elementwise_op, if X is Sparse, "
+                     "Y must be scalar.");
+    } else {
+      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                   ctx->GetInputsVarType("X").front());
+    }

    ctx->ShareDim("X", /*->*/ "Out");
    ctx->ShareLoD("X", /*->*/ "Out");

--- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
-  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
-    math::VecActivations<T, platform::jit::avx> act_functor;                   \
+  if (platform::MayIUse(platform::avx)) {                                      \
+    math::VecActivations<T, platform::avx> act_functor;                        \
    act_gate = act_functor(act_gate_str);                                      \
    act_cell = act_functor(act_cell_str);                                      \
    act_cand = act_functor(act_cand_str);                                      \
  } else {                                                                     \
-    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
+    math::VecActivations<T, platform::isa_any> act_functor;                    \
    act_gate = act_functor(act_gate_str);                                      \
    act_cell = act_functor(act_cell_str);                                      \
    act_cand = act_functor(act_cand_str);                                      \

--- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {

    std::function<void(const int, const T*, T*)> fc_act;
    auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
+    if (platform::MayIUse(platform::avx)) {
+      math::VecActivations<T, platform::avx> act_functor;
      fc_act = act_functor(fc_act_str);
    } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      math::VecActivations<T, platform::isa_any> act_functor;
      fc_act = act_functor(fc_act_str);
    }


--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "GetTensorFromSelectedRowsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "GetTensorFromSelectedRowsOp must has output Out.");
+    PADDLE_ENFORCE(
+        ctx->GetInputsVarType("X").front() ==
+            framework::proto::VarType::SELECTED_ROWS,
+        "The input X's type should be SelectedRows, but the received is %s",
+        ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
+    PADDLE_ENFORCE(
+        ctx->GetOutputsVarType("Out").front() ==
+            framework::proto::VarType::LOD_TENSOR,
+        "The output Out's type should be LoDTensor, but the received is %s",
+        ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front());
+
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
+  }
+};
+
+class GetTensorFromSelectedRowsKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *x = ctx.Input<framework::SelectedRows>("X");
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    out->Resize(x->value().dims());
+    out->mutable_data(ctx.GetPlace(), x->value().type());
+    framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(),
+                          out);
+  }
+};
+
+class GetTensorFromSelectedRowsOpProtoMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input type is SelectedRows.");
+    AddOutput("Out", "The output type is LoDTensor.");
+    AddComment(
+        R"DOC(
+GetTensorFromSelectedRows Operator
+
+GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
+
+)DOC");
+  }
+};
+
+class GetTensorFromSelectedRowsOpVarTypeInference
+    : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const final {
+    auto out_var_name = op_desc.Output("Out").front();
+    auto in_var_name = op_desc.Input("X").front();
+
+    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
+    out_var.SetType(framework::proto::VarType::LOD_TENSOR);
+    out_var.SetDataType(in_var.GetDataType());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(get_tensor_from_selected_rows,
+                  ops::GetTensorFromSelectedRowsOp,
+                  ops::GetTensorFromSelectedRowsOpProtoMaker,
+                  ops::GetTensorFromSelectedRowsOpVarTypeInference);
+
+REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
+                               ops::GetTensorFromSelectedRowsKernel, double,
+                               ops::GetTensorFromSelectedRowsKernel, int,
+                               ops::GetTensorFromSelectedRowsKernel, int64_t,
+                               ops::GetTensorFromSelectedRowsKernel);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
+                                ops::GetTensorFromSelectedRowsKernel, double,
+                                ops::GetTensorFromSelectedRowsKernel, int,
+                                ops::GetTensorFromSelectedRowsKernel, int64_t,
+                                ops::GetTensorFromSelectedRowsKernel);
+#endif
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                   "Output(W@Grad should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@Grad should not be null.");
-    if (!ctx->Attrs().Get<bool>("is_sparse")) {
-      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-        ctx->SetOutputDim(framework::GradVarName("Bias"),
-                          ctx->GetInputDim("Bias"));
-      }
-      ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
+
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
    }
+    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
  }

 protected:

--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
          ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
      w_grad->set_rows(real_rows);
      // Build a map of id -> row_index to speed up finding the index of one id
-      w_grad->SyncIndex();
      w_grad->set_height(w.dims()[0]);
      auto* w_grad_value = w_grad->mutable_value();
      framework::DDim temp_dim(w.dims());

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -59,6 +59,7 @@ math_library(matrix_bit_code)

 math_library(unpooling)
 math_library(vol2col)
+math_library(prelu)

 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)

--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) {
 #endif

 // MKL scal only support inplace, choose this if src and dst are not equal
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_scal(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a * x[i];
@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) {
 }

 template <>
-inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
-                                                const float* x, float* y) {
+inline void vec_scal<float, platform::avx>(const int n, const float a,
+                                           const float* x, float* y) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
-    vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+    vec_scal<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
    y[i] = a * x[i];
  }
 #else
-  vec_scal<float, platform::jit::isa_any>(n, a, x, y);
+  vec_scal<float, platform::isa_any>(n, a, x, y);
 #endif
 }

 template <>
-inline void vec_scal<float, platform::jit::avx2>(const int n, const float a,
-                                                 const float* x, float* y) {
-  vec_scal<float, platform::jit::avx>(n, a, x, y);
+inline void vec_scal<float, platform::avx2>(const int n, const float a,
+                                            const float* x, float* y) {
+  vec_scal<float, platform::avx>(n, a, x, y);
 }

 template <>
-inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_scal<float, platform::avx512f>(const int n, const float a,
+                                               const float* x, float* y) {
  // TODO(TJ): enable me
-  vec_scal<float, platform::jit::avx2>(n, a, x, y);
+  vec_scal<float, platform::avx2>(n, a, x, y);
 }

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a - x[i];
@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
 }

 template <>
-inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
+                                               const float* x, float* y) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
-    vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+    vec_bias_sub<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
    y[i] = a - x[i];
  }
 #else
-  vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+  vec_bias_sub<float, platform::isa_any>(n, a, x, y);
 #endif
 }

 template <>
-inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
-                                                     const float* x, float* y) {
-  vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
+inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
+                                                const float* x, float* y) {
+  vec_bias_sub<float, platform::avx>(n, a, x, y);
 }

 template <>
-inline void vec_bias_sub<float, platform::jit::avx512f>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
+inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
+                                                   const float* x, float* y) {
  // TODO(TJ): enable me
-  vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
+  vec_bias_sub<float, platform::avx2>(n, a, x, y);
 }

 // out = x*y + (1-x)*z
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
 }

 template <>
-inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
-                                                 const float* y, const float* z,
-                                                 float* out) {
+inline void vec_cross<float, platform::avx>(const int n, const float* x,
+                                            const float* y, const float* z,
+                                            float* out) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
-    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    vec_cross<float, platform::isa_any>(n, x, y, z, out);
    return;
  }
  const int rest = n % block;
@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
  }
 #else
-  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+  vec_cross<float, platform::isa_any>(n, x, y, z, out);
 #endif
 }

 template <>
-inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
-                                                  const float* y,
-                                                  const float* z, float* out) {
-  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+inline void vec_cross<float, platform::avx2>(const int n, const float* x,
+                                             const float* y, const float* z,
+                                             float* out) {
+  vec_cross<float, platform::avx>(n, x, y, z, out);
 }

 template <>
-inline void vec_cross<float, platform::jit::avx512f>(
-    const int n, const float* x, const float* y, const float* z, float* out) {
+inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
+                                                const float* y, const float* z,
+                                                float* out) {
  // TODO(TJ): enable me
-  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+  vec_cross<float, platform::avx>(n, x, y, z, out);
 }

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + a;
@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
 }

 template <>
-inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
-                                                    const float* x, float* y) {
+inline void vec_add_bias<float, platform::avx>(const int n, const float a,
+                                               const float* x, float* y) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
-    vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+    vec_add_bias<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
    y[i] = x[i] + a;
  }
 #else
-  vec_add_bias<float, platform::jit::isa_any>(n, a, x, y);
+  vec_add_bias<float, platform::isa_any>(n, a, x, y);
 #endif
 }

 template <>
-inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a,
-                                                     const float* x, float* y) {
-  vec_add_bias<float, platform::jit::avx>(n, a, x, y);
+inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
+                                                const float* x, float* y) {
+  vec_add_bias<float, platform::avx>(n, a, x, y);
 }

 template <>
-inline void vec_add_bias<float, platform::jit::avx512f>(const int n,
-                                                        const float a,
-                                                        const float* x,
-                                                        float* y) {
+inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
+                                                   const float* x, float* y) {
  // TODO(TJ): enable me
-  vec_add_bias<float, platform::jit::avx2>(n, a, x, y);
+  vec_add_bias<float, platform::avx2>(n, a, x, y);
 }

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_identity(const int n, const T* x, T* y) {
  // do nothing
  return;
 }

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_sigmoid(const int n, const T* x, T* y) {
  const T min = SIGMOID_THRESHOLD_MIN;
  const T max = SIGMOID_THRESHOLD_MAX;
@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
 }

 template <>
-inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
-                                                   float* y) {
+inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
+                                              float* y) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
-    vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+    vec_sigmoid<float, platform::isa_any>(n, x, y);
    return;
  }
  const int rest = n % block;
@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
    y[i] = 1.f / (1.f + y[i]);
  }
 #else
-  vec_sigmoid<float, platform::jit::isa_any>(n, x, y);
+  vec_sigmoid<float, platform::isa_any>(n, x, y);
 #endif
 }

 template <>
-inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x,
-                                                    float* y) {
-  vec_sigmoid<float, platform::jit::avx>(n, x, y);
+inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
+                                               float* y) {
+  vec_sigmoid<float, platform::avx>(n, x, y);
 }

 template <>
-inline void vec_sigmoid<float, platform::jit::avx512f>(const int n,
-                                                       const float* x,
-                                                       float* y) {
+inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
+                                                  float* y) {
  // TODO(TJ): enable me
-  vec_sigmoid<float, platform::jit::avx2>(n, x, y);
+  vec_sigmoid<float, platform::avx2>(n, x, y);
 }

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_tanh(const int n, const T* x, T* y) {
  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
  vec_sigmoid<T, isa>(n, y, y);
@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
 }

 // TODO(TJ): make relu clip
-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) {
 }

 template <>
-inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
-                                                float* y) {
+inline void vec_relu<float, platform::avx>(const int n, const float* x,
+                                           float* y) {
 #ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block * 4) {
-    vec_relu<float, platform::jit::isa_any>(n, x, y);
+    vec_relu<float, platform::isa_any>(n, x, y);
    return;
  }

@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
 #undef MOVE_ONE_STEP

 #else
-  vec_relu<float, platform::jit::isa_any>(n, x, y);
+  vec_relu<float, platform::isa_any>(n, x, y);
 #endif
 }

 template <>
-inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x,
-                                                 float* y) {
-  vec_relu<float, platform::jit::avx>(n, x, y);
+inline void vec_relu<float, platform::avx2>(const int n, const float* x,
+                                            float* y) {
+  vec_relu<float, platform::avx>(n, x, y);
 }

 template <>
-inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x,
-                                                    float* y) {
+inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
+                                               float* y) {
  // TODO(TJ): enable me
-  vec_relu<float, platform::jit::avx2>(n, x, y);
+  vec_relu<float, platform::avx2>(n, x, y);
 }

 // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary

-template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+template <typename T, platform::cpu_isa_t isa = platform::isa_any>
 class VecActivations {
 public:
  std::function<void(const int, const T*, T*)> operator()(

--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
 }

 TEST(CpuVecTest, sigmoid) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>,
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
+                        ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
+                        ref_sigmoid<float>);
+    TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
                        ref_sigmoid<float>);
  }
  TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }

 TEST(CpuVecTest, tanh) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
+    TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
+                        ref_tanh<float>);
  }
  TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
 }

 TEST(CpuVecTest, relu) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
+    TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
+                        ref_relu<float>);
  }
  TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
 }
@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
 }

 TEST(CpuVecTest, inplace_sigmoid) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>);
-    TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>,
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
+                       ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
+                       ref_sigmoid<float>);
+    TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
                       ref_sigmoid<float>);
  }
  TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
 }

 TEST(CpuVecTest, inplace_tanh) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>);
-    TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
+    TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
  }
  TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
 }

 TEST(CpuVecTest, inplace_relu) {
-  namespace jit = paddle::platform::jit;
+  namespace platform = paddle::platform;
  using namespace paddle::operators::math;  // NOLINT
  for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
    TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>);
-    TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
+    TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
  }
  TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
 }
--- a/paddle/fluid/operators/math/jit_code.cc
+++ b/paddle/fluid/operators/math/jit_code.cc
@@ -22,7 +22,7 @@ namespace math {
 namespace jitkernel {
 namespace gen {

-using namespace platform::jit;  // NOLINT
+using namespace platform;  // NOLINT

 bool VXXJitCode::init(int d, int scalar_index) {
  // It's not necessary to use avx512 since it would slow down the frequency

--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -179,7 +179,7 @@ class VActJitCode : public JitCode {
  template <typename JMM>
  void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12,  // NOLINT
               int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
-    using namespace platform::jit;  // NOLINT
+    using namespace platform;  // NOLINT
    // check all idx can not equal
    JMM jmm_src = JMM(src_idx);
    JMM jmm_fx = JMM(fx_idx);

--- a/paddle/fluid/operators/math/jit_gen.cc
+++ b/paddle/fluid/operators/math/jit_gen.cc
@@ -36,7 +36,7 @@ void JitCode::preCode() {
  for (int i = 0; i < num_g_abi_regs; ++i) {
    push(Xbyak::Reg64(g_abi_regs[i]));
  }
-  if (platform::jit::MayIUse(platform::jit::avx512f)) {
+  if (platform::MayIUse(platform::avx512f)) {
    mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
  }
 }

--- a/paddle/fluid/operators/math/jit_kernel.cc
+++ b/paddle/fluid/operators/math/jit_kernel.cc
@@ -21,8 +21,6 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-namespace jit = platform::jit;
-
 KernelPool& KernelPool::Instance() {
  static thread_local KernelPool g_jit_kernels;
  return g_jit_kernels;

--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -30,7 +30,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace jit = platform::jit;

 #ifdef PADDLE_WITH_MKLML
 template <typename T>
@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) {
 #ifdef PADDLE_WITH_MKLML
 template <>
 bool VMulKernelImpl<float>::useMKL(int d) {
-  return jit::MayIUse(jit::avx512f) && d > 512;
+  return platform::MayIUse(platform::avx512f) && d > 512;
 }

 template <>

--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -25,10 +25,8 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-namespace jit = platform::jit;
-
 /* CRF Decode JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+template <typename T, platform::cpu_isa_t isa, jit_block>
 class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
 public:
  explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {

 #define INTRIAVX_FLOAT(block)                                                  \
  template <>                                                                  \
-  CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl(            \
+  CRFDecodeKernelImpl<float, platform::avx, block>::CRFDecodeKernelImpl(       \
      int tag_num)                                                             \
      : CRFDecodeKernel<float>() {                                             \
    this->num_ = tag_num;                                                      \
@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
  }                                                                            \
  template <>                                                                  \
-  void CRFDecodeKernelImpl<float, jit::avx, block>::Compute(                   \
+  void CRFDecodeKernelImpl<float, platform::avx, block>::Compute(              \
      const int seq_len, const float* x, const float* w, float* alpha,         \
      int* track) const {                                                      \
    INIT_ALPHA(YMM_FLOAT_BLOCK)                                                \
@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {

 #define INTRIAVX512_FLOAT(block)                                               \
  template <>                                                                  \
-  CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl(        \
+  CRFDecodeKernelImpl<float, platform::avx512f, block>::CRFDecodeKernelImpl(   \
      int tag_num)                                                             \
      : CRFDecodeKernel<float>() {                                             \
    this->num_ = tag_num;                                                      \
@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
    this->rest_ = this->num_ % ZMM_FLOAT_BLOCK;                                \
  }                                                                            \
  template <>                                                                  \
-  void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute(               \
+  void CRFDecodeKernelImpl<float, platform::avx512f, block>::Compute(          \
      const int seq_len, const float* x, const float* w, float* alpha,         \
      int* track) const {                                                      \
    INIT_ALPHA(ZMM_FLOAT_BLOCK)                                                \
@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16);
 INTRIAVX_FLOAT(kGT16);
 #endif
 #ifdef __AVX2__
-INTRIAVX2_FLOAT(jit::avx2, kEQ8);
-INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
-INTRIAVX2_FLOAT(jit::avx2, kEQ16);
-INTRIAVX2_FLOAT(jit::avx2, kGT16);
+INTRIAVX2_FLOAT(platform::avx2, kEQ8);
+INTRIAVX2_FLOAT(platform::avx2, kGT8LT16);
+INTRIAVX2_FLOAT(platform::avx2, kEQ16);
+INTRIAVX2_FLOAT(platform::avx2, kGT16);
 #endif
 #ifdef __AVX512F__
-INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
-INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
+INTRIAVX2_FLOAT(platform::avx512f, kEQ8);
+INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16);
 INTRIAVX512_FLOAT(kEQ16);
 INTRIAVX512_FLOAT(kGT16);
 #endif

--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -29,7 +29,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 namespace jitkernel {
-namespace jit = platform::jit;

 #ifdef PADDLE_WITH_MKLML
 // try to use MKL to speedup

--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -22,10 +22,8 @@ namespace operators {
 namespace math {
 namespace jitkernel {

-namespace jit = platform::jit;
-
 /* Layer Norm JitKernel */
-template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+template <typename T, platform::cpu_isa_t isa, jit_block>
 class LayerNormKernelImpl : public LayerNormKernel<T> {
 public:
  explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
@@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
    this->end_ = this->num_ - this->rest_;                                     \
  }                                                                            \
  template <>                                                                  \
-  void LayerNormKernelImpl<float, jit::avx, block>::Compute(                   \
+  void LayerNormKernelImpl<float, platform::avx, block>::Compute(              \
      float* x, float* out, float* mean, float* var, const float* scale,       \
      const float* bias, int height, const float epsilon) const {              \
    __m256 sum;                                                                \
@@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
  }

 #ifdef __AVX__
-INTRIAVX_FLOAT(jit::avx, kEQ8);
-INTRIAVX_FLOAT(jit::avx, kGT8LT16);
-INTRIAVX_FLOAT(jit::avx, kEQ16);
-INTRIAVX_FLOAT(jit::avx, kGT16);
+INTRIAVX_FLOAT(platform::avx, kEQ8);
+INTRIAVX_FLOAT(platform::avx, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx, kEQ16);
+INTRIAVX_FLOAT(platform::avx, kGT16);
 #endif
 #ifdef __AVX2__
-INTRIAVX_FLOAT(jit::avx2, kEQ8);
-INTRIAVX_FLOAT(jit::avx2, kGT8LT16);
-INTRIAVX_FLOAT(jit::avx2, kEQ16);
-INTRIAVX_FLOAT(jit::avx2, kGT16);
+INTRIAVX_FLOAT(platform::avx2, kEQ8);
+INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx2, kEQ16);
+INTRIAVX_FLOAT(platform::avx2, kGT16);
 #endif

 #undef INTRIAVX_FLOAT

--- a/paddle/fluid/operators/math/jit_kernel_macro.h
+++ b/paddle/fluid/operators/math/jit_kernel_macro.h
@@ -92,7 +92,6 @@ namespace jitkernel {
                          JITKERNEL_DECLARE, JITKERNEL_FIND_KEY,     \
                          JITKERNEL_IMPL)

-namespace jit = platform::jit;
 // TODO(TJ): below defines are deprecated, would be remove recently
 #define SEARCH_BLOCK(macro_, ker, dtype, isa)              \
  if (d < YMM_FLOAT_BLOCK) {                               \
@@ -107,15 +106,15 @@ namespace jit = platform::jit;
    macro_(ker, dtype, isa, kGT16);                        \
  }

-#define SEARCH_ISA_BLOCK(macro_, ker, dtype)        \
-  if (jit::MayIUse(jit::avx512f)) {                 \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \
-  } else if (jit::MayIUse(jit::avx2)) {             \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx2);    \
-  } else if (jit::MayIUse(jit::avx)) {              \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::avx);     \
-  } else {                                          \
-    SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
+#define SEARCH_ISA_BLOCK(macro_, ker, dtype)             \
+  if (platform::MayIUse(platform::avx512f)) {            \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
+  } else if (platform::MayIUse(platform::avx2)) {        \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx2);    \
+  } else if (platform::MayIUse(platform::avx)) {         \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::avx);     \
+  } else {                                               \
+    SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
  }

 #define JITKERNEL_KEY(ker_key, dtype_key) \
@@ -156,10 +155,10 @@ namespace jit = platform::jit;
                                  marco_declare, macro_key, macro_impl)

 #define FOR_EACH_ISA(macro_, block) \
-  macro_(jit::avx512f, block);      \
-  macro_(jit::avx2, block);         \
-  macro_(jit::avx, block);          \
-  macro_(jit::isa_any, block)
+  macro_(platform::avx512f, block); \
+  macro_(platform::avx2, block);    \
+  macro_(platform::avx, block);     \
+  macro_(platform::isa_any, block)

 #define FOR_EACH_BLOCK(macro_, isa) \
  macro_(isa, kLT8);                \
@@ -168,11 +167,11 @@ namespace jit = platform::jit;
  macro_(isa, kEQ16);               \
  macro_(isa, kGT16)

-#define FOR_EACH_ISA_BLOCK(macro_)      \
-  FOR_EACH_BLOCK(macro_, jit::avx512f); \
-  FOR_EACH_BLOCK(macro_, jit::avx2);    \
-  FOR_EACH_BLOCK(macro_, jit::avx);     \
-  FOR_EACH_BLOCK(macro_, jit::isa_any)
+#define FOR_EACH_ISA_BLOCK(macro_)           \
+  FOR_EACH_BLOCK(macro_, platform::avx512f); \
+  FOR_EACH_BLOCK(macro_, platform::avx2);    \
+  FOR_EACH_BLOCK(macro_, platform::avx);     \
+  FOR_EACH_BLOCK(macro_, platform::isa_any)

 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -705,7 +705,7 @@ TEST(JitKernel, pool) {
  jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);

  // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
-  paddle::platform::jit::MayIUse(paddle::platform::jit::avx);
+  paddle::platform::MayIUse(paddle::platform::avx);
  const auto& plstm1 =
      jit::KernelPool::Instance()
          .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);

--- a/paddle/fluid/operators/math/matrix_bit_code.cc
+++ b/paddle/fluid/operators/math/matrix_bit_code.cc
@@ -89,6 +89,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
                                  const framework::Tensor& weight,
                                  const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
  size_t num_samples = tmat->dims()[0];
  size_t tmat_width = tmat->dims()[1];
  size_t input_width = input.dims()[1];
@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table_->get_code(i);
    int code_length = code->get_length();
+    const T* input_row = input_value + input_width * i;
    for (int j = 0; j < code_length; ++j) {
      size_t index = code->calc_index(j);
+      const T* weight_row = weight_value + weight_width * index;
      T sum = static_cast<T>(0.0);
-      for (size_t k = 0; k < input_width; ++k) {
-        sum += weight_value[weight_width * index + k] *
-               input_value[input_width * i + k];
-      }
+      sum = blas.DOT(input_width, weight_row, input_row);
      tmat_value[i * tmat_width + j] += sum;
    }
  }
@@ -115,6 +116,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                            framework::Tensor* weight,
                                            const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
  size_t num_samples = tmat.dims()[0];
  size_t input_width = input.dims()[1];
  size_t tmat_width = tmat.dims()[1];
@@ -122,16 +125,25 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
  auto tmat_value = tmat.data<T>();
  auto weight_value = weight->data<T>();
  auto input_value = input.data<T>();
+
+  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table_->get_code(i);
    int code_length = code->get_length();
+    const T* input_value_row = input_value + input_width * i;
+    const T* tmat_row = tmat_value + i * tmat_width;
    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-
-      for (size_t k = 0; k < input_width; ++k) {
-        weight_value[weight_width * index + k] +=
-            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
-      }
+      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    }
+  }
+  for (auto& op : ops) {
+    auto& op_in_row = op.second;
+    for (auto& pair : op_in_row) {
+      auto& scale = pair.first;
+      auto* input_row = pair.second;
+      T* weight_row = weight_value + op.first * weight_width;
+      blas.AXPY(input_width, scale, input_row, weight_row);
    }
  }
 }
@@ -140,6 +152,8 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                            framework::SelectedRows* weight,
                                            const framework::Tensor& input) {
+  auto blas =
+      GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
  size_t num_samples = tmat.dims()[0];
  size_t input_width = input.dims()[1];
  size_t tmat_width = tmat.dims()[1];
@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
  auto tmat_value = tmat.data<T>();
  auto weight_value = weight->mutable_value()->data<T>();
  auto input_value = input.data<T>();
+
+  std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
+  ops.reserve(weight->rows().size());
+
  for (size_t i = 0; i < num_samples; ++i) {
    auto code = code_table_->get_code(i);
    int code_length = code->get_length();
+    const T* input_value_row = input_value + input_width * i;
+    const T* tmat_row = tmat_value + i * tmat_width;
    for (int j = 0; j < code_length; ++j) {
-      size_t index = code->calc_index(j);
-      for (size_t k = 0; k < input_width; ++k) {
-        int64_t row_index = weight->GetIndexFromId(static_cast<int64_t>(index));
-        weight_value[row_index * weight_width + k] +=
-            tmat_value[i * tmat_width + j] * input_value[input_width * i + k];
-      }
+      ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
+    }
+  }
+
+  for (auto& row : weight->rows()) {
+    auto& op_in_row = ops[row];
+    for (auto& pair : op_in_row) {
+      auto& scale = pair.first;
+      auto* input_row = pair.second;
+      blas.AXPY(input_width, scale, input_row, weight_value);
    }
+    weight_value += weight_width;
  }
 }


--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"

 #if defined(_WIN32)

--- a/paddle/fluid/operators/math/prelu.cu
+++ b/paddle/fluid/operators/math/prelu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/prelu.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+static const int CUDA_NUM_THREADS = 1024;
+static const int CUDA_MAX_NUM_BLOCKS = 65535;
+inline static int GET_NUM_BLOCKS(const int N) {
+  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
+
+template <typename T>
+__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
+                                       T *output, int channel,
+                                       size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  T *out = output + offset;
+  T scale = alpha[blockIdx.x % channel];
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
+                                       T *output, size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  const T *scale = alpha + offset;
+  T *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale[i] * x;
+  }
+}
+
+template <typename T>
+__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
+                                  size_t spatial_size) {
+  size_t offset = blockIdx.x * spatial_size;
+  const T *in = input + offset;
+  T scale = *alpha;
+  T *out = output + offset;
+
+  for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
+    T x = in[i];
+    out[i] = (x > 0) ? x : scale * x;
+  }
+}
+
+template <typename T>
+static inline void PReluChannelWise(cudaStream_t stream, const T *input,
+                                    const T *alpha, T *output,
+                                    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, input_shape[1], spatial_size);
+}
+
+template <typename T>
+static inline void PReluElementWise(cudaStream_t stream, const T *input,
+                                    const T *alpha, T *output,
+                                    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+static inline void PReluScalar(cudaStream_t stream, const T *input,
+                               const T *alpha, T *output,
+                               std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
+    cudaStream_t stream, const T *input, const T *alpha, T *output,
+    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, input_shape[1], spatial_size);
+}
+
+template <typename T>
+void PreluElementWiseDirectCUDAFunctor<T>::operator()(
+    cudaStream_t stream, const T *input, const T *alpha, T *output,
+    std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template <typename T>
+void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
+                                                 const T *input, const T *alpha,
+                                                 T *output,
+                                                 std::vector<int> input_shape) {
+  size_t unroll = input_shape[0] * input_shape[1];
+  size_t spatial_size = input_shape[2] * input_shape[3];
+  CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
+  PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
+      input, alpha, output, spatial_size);
+}
+
+template class PreluChannelWiseDirectCUDAFunctor<float>;
+template class PreluChannelWiseDirectCUDAFunctor<double>;
+
+template class PreluElementWiseDirectCUDAFunctor<float>;
+template class PreluElementWiseDirectCUDAFunctor<double>;
+
+template class PreluScalarDirectCUDAFunctor<float>;
+template class PreluScalarDirectCUDAFunctor<double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/prelu.h
+++ b/paddle/fluid/operators/math/prelu.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class PreluChannelWiseDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+
+template <typename T>
+class PreluElementWiseDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+
+template <typename T>
+class PreluScalarDirectCUDAFunctor {
+ public:
+  void operator()(cudaStream_t stream, const T *input, const T *alpha,
+                  T *output, std::vector<int> input_shape);
+};
+#endif
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/merge_selected_rows_op.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_selected_rows_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MergeSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MergeSelectedRowsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MergeSelectedRowsOp should not be null.");
+    ctx->ShareDim("X", /*->*/ "Out");
+  }
+};
+
+class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input type is SelectedRows, and the selected rows may be "
+             "duplicated.");
+    AddOutput("Out",
+              "The output type is SelectedRows, and the selected rows are not "
+              "duplicated.");
+    AddComment(
+        R"DOC(
+MergeSelectedRows Operator.
+
+MergeSelectedRows is used to merge the duplicated rows of the input.
+)DOC");
+  }
+};
+
+class MergeSelectedRowsOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp,
+                  ops::MergeSelectedRowsOpMaker,
+                  ops::MergeSelectedRowsOpInferVarType);
+
+REGISTER_OP_CPU_KERNEL(
+    merge_selected_rows,
+    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, float>,
+    ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc
+++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_selected_rows_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    merge_selected_rows,
+    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, float>,
+    ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/merge_selected_rows_op.h
+++ b/paddle/fluid/operators/merge_selected_rows_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MergeSelectedRowsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::SelectedRows>("X");
+    auto* out = context.Output<framework::SelectedRows>("Out");
+
+    math::scatter::MergeAdd<DeviceContext, T> merge_func;
+    merge_func(context.template device_context<DeviceContext>(), *x, out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
-        platform::CPUPlace());
+        ctx.device_context());
  }
 };


--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/prelu.h"
+#include "paddle/fluid/operators/prelu_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class CUDAPReluKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* out = context.Output<Tensor>("Out");
+
+    const T* x_ptr = x->data<T>();
+    T* o_ptr = out->mutable_data<T>(context.GetPlace());
+
+    const T* alpha_ptr = alpha->data<T>();
+    auto& mode = context.Attr<std::string>("mode");
+
+    int numel = x->numel();
+    auto dim = x->dims();
+    std::vector<int> input_shape = framework::vectorize2int(dim);
+
+    if (mode == "channel") {
+      math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
+      prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
+                         alpha_ptr, o_ptr, input_shape);
+    } else if (mode == "element") {
+      math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
+      prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
+                         alpha_ptr, o_ptr, input_shape);
+    } else {
+      math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
+      prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
+                   o_ptr, input_shape);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() {
  return CUDAPinnedMaxAllocSize() / 256;
 }

-namespace jit {
 #ifdef PADDLE_WITH_XBYAK
 static Xbyak::util::Cpu cpu;
 bool MayIUse(const cpu_isa_t cpu_isa) {
@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
 }
 #endif

-}  // namespace jit
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();

-namespace jit {
 typedef enum {
  isa_any,
  sse42,
@@ -55,7 +54,5 @@ typedef enum {
 // May I use some instruction
 bool MayIUse(const cpu_isa_t cpu_isa);

-}  // namespace jit
-
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
          case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
            auto *kernel =
                reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
-            tracer->AddKernelRecords(kernel->start, kernel->end,
+            tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
                                     kernel->deviceId, kernel->streamId,
                                     kernel->correlationId);
            break;
@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer {
                                     stream_id, correlation_id, bytes});
  }

-  void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
-                        int64_t stream_id, uint32_t correlation_id) {
+  void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
+                        int64_t device_id, int64_t stream_id,
+                        uint32_t correlation_id) {
    // 0 means timestamp information could not be collected for the kernel.
    if (start == 0 || end == 0) {
      VLOG(3) << correlation_id << " cannot be traced";
@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer {
    }
    std::lock_guard<std::mutex> l(trace_mu_);
    kernel_records_.push_back(
-        KernelRecord{start, end, device_id, stream_id, correlation_id});
+        KernelRecord{name, start, end, device_id, stream_id, correlation_id});
  }

  bool IsEnabled() {
@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer {
    profile_pb.set_start_ns(start_ns_);
    profile_pb.set_end_ns(end_ns_);
    for (const KernelRecord &r : kernel_records_) {
-      if (correlations_.find(r.correlation_id) == correlations_.end()) {
-        fprintf(stderr, "cannot relate a kernel activity\n");
-        continue;
-      }
      auto *event = profile_pb.add_events();
      event->set_type(proto::Event::GPUKernel);
-      event->set_name(correlations_.at(r.correlation_id));
+      if (correlations_.find(r.correlation_id) != correlations_.end()) {
+        event->set_name(correlations_.at(r.correlation_id));
+      } else {
+        event->set_name(r.name);
+      }
      event->set_start_ns(r.start_ns);
      event->set_end_ns(r.end_ns);
      event->set_sub_device_id(r.stream_id);

--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() {
 class DeviceTracer {
 public:
  struct KernelRecord {
+    std::string name;
    uint64_t start_ns;
    uint64_t end_ns;
    int64_t device_id;
@@ -84,8 +85,9 @@ class DeviceTracer {

  // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
  // added before for human readability.
-  virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
-                                int64_t stream_id, uint32_t correlation_id) = 0;
+  virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
+                                int64_t device_id, int64_t stream_id,
+                                uint32_t correlation_id) = 0;

  // Generate a proto after done (Disabled).
  virtual proto::Profile GenProfile(const std::string& profile_path) = 0;

--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
  __macro(cudnnRNNBackwardWeights);                       \
  __macro(cudnnRNNForwardInference);                      \
  __macro(cudnnDestroyDropoutDescriptor);                 \
-  __macro(cudnnDestroyRNNDescriptor);                     \
-  __macro(cudnnSetRNNDescriptor_v6);
+  __macro(cudnnDestroyRNNDescriptor);

 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)

@@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif

+// APIs in R6
+#if CUDNN_VERSION >= 6000
+#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6);
+CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro)        \
  __macro(cudnnSetConvolutionGroupCount);         \

--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -18,6 +18,7 @@ limitations under the License. */

 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/split.h"

 #ifndef _WIN32
 constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
@@ -45,6 +46,15 @@ DEFINE_bool(
    "input and output must be half precision) and recurrent neural networks "
    "(RNNs).");

+DEFINE_string(selected_gpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi process training and "
+              "each process have only one device (GPU). If you want to use "
+              "all visible devices, set this to empty string. NOTE: the "
+              "reason of doing this is that we want to use P2P communication"
+              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
+              "share-memory only.");
+
 namespace paddle {
 namespace platform {

@@ -121,6 +131,24 @@ int GetCurrentDeviceId() {
  return device_id;
 }

+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetSelectedDevices() {
+  // use user specified GPUs in single-node multi-process mode.
+  std::vector<int> devices;
+  if (!FLAGS_selected_gpus.empty()) {
+    auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
+    for (auto id : devices_str) {
+      devices.push_back(atoi(id.c_str()));
+    }
+  } else {
+    int count = GetCUDADeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  }
+  return devices;
+}
+
 void SetDeviceId(int id) {
  // TODO(qijun): find a better way to cache the cuda device count
  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");

--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <cuda_runtime.h>
 #include <stddef.h>
 #include <string>
+#include <vector>

 namespace paddle {
 namespace platform {
@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i);
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();

+//! Get a list of device ids from environment variable or use all.
+std::vector<int> GetSelectedDevices();
+
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);


--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/string/split.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) {
  std::vector<int> devices;
 #ifdef PADDLE_WITH_CUDA
  try {
-    int count = platform::GetCUDADeviceCount();
-    for (int i = 0; i < count; ++i) {
-      devices.push_back(i);
-    }
+    // use user specified GPUs in single-node multi-process mode.
+    devices = platform::GetSelectedDevices();
  } catch (const std::exception &exp) {
    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
  }
@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) {

 void InitDevices(bool init_p2p, const std::vector<int> devices) {
  std::vector<platform::Place> places;
-  int count = 0;
-#ifdef PADDLE_WITH_CUDA
-  try {
-    count = platform::GetCUDADeviceCount();
-  } catch (const std::exception &exp) {
-    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
-  }
-#endif

  for (size_t i = 0; i < devices.size(); ++i) {
-    if (devices[i] >= count || devices[i] < 0) {
+    // In multi process multi gpu mode, we may have gpuid = 7
+    // but count = 1.
+    if (devices[i] < 0) {
      LOG(WARNING) << "Invalid devices id.";
      continue;
    }
+
    places.emplace_back(platform::CUDAPlace(devices[i]));
  }
  if (init_p2p) {
@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif

 #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
-  if (platform::jit::MayIUse(platform::jit::avx)) {
+  if (platform::MayIUse(platform::avx)) {
 #ifndef __AVX__
    LOG(WARNING) << "AVX is available, Please re-compile on local machine";
 #endif
@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
         " version or compile from source code."

 #ifdef __AVX512F__
-  if (!platform::jit::MayIUse(platform::jit::avx512f)) {
-    if (platform::jit::MayIUse(platform::jit::avx2)) {
+  if (!platform::MayIUse(platform::avx512f)) {
+    if (platform::MayIUse(platform::avx2)) {
      AVX_GUIDE(AVX512, AVX2);
-    } else if (platform::jit::MayIUse(platform::jit::avx)) {
+    } else if (platform::MayIUse(platform::avx)) {
      AVX_GUIDE(AVX512, AVX);
    } else {
      AVX_GUIDE(AVX512, NonAVX);
@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif

 #ifdef __AVX2__
-  if (!platform::jit::MayIUse(platform::jit::avx2)) {
-    if (platform::jit::MayIUse(platform::jit::avx)) {
+  if (!platform::MayIUse(platform::avx2)) {
+    if (platform::MayIUse(platform::avx)) {
      AVX_GUIDE(AVX2, AVX);
    } else {
      AVX_GUIDE(AVX2, NonAVX);
@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
 #endif

 #ifdef __AVX__
-  if (!platform::jit::MayIUse(platform::jit::avx)) {
+  if (!platform::MayIUse(platform::avx)) {
    AVX_GUIDE(AVX, NonAVX);
  }
 #endif

--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -97,7 +97,7 @@ struct NCCLContextMap {
        order_.size(), contexts_.size(),
        "NCCL Context Map does not support contain two or more same device");

-    if (places.size() <= 1) {
+    if (places.size() <= 1 && num_trainers == 1) {
      return;
    }
    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
@@ -111,12 +111,19 @@ struct NCCLContextMap {
      {
        int nranks = num_trainers * order_.size();
        NCCLGroupGuard gurad;
-        for (auto &gpu_id : order_) {
-          int rank = trainer_id * order_.size() + gpu_id;
-          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+        for (size_t i = 0; i < order_.size(); ++i) {
+          int gpu_id = order_[i];
+          int rank;
+          if (order_.size() > 1) {
+            rank = trainer_id * order_.size() + i;
+          } else {
+            rank = trainer_id;
+          }
+          VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
+                   << "gpu id: " << gpu_id;
          PADDLE_ENFORCE(cudaSetDevice(gpu_id));
          PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
-              comms.get() + gpu_id, nranks, *nccl_id, rank));
+              comms.get() + i, nranks, *nccl_id, rank));
        }
      }
    }

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt

-set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc)
+
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED

--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/tracer.h"
+
+namespace paddle {
+namespace pybind {
+
+// Bind Methods
+void BindTracer(pybind11::module *m) {
+  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
+      .def("__init__",
+           [](imperative::Tracer &self, framework::BlockDesc *root_block) {
+             new (&self) imperative::Tracer(root_block);
+           })
+      .def("trace", &imperative::Tracer::Trace)
+      .def("get_scope", &imperative::Tracer::GetScope,
+           pybind11::return_value_policy::reference);
+}
+
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <Python.h>
+#include <vector>
+#include "paddle/fluid/imperative/layer.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+class PyLayer : public imperative::Layer {
+ public:
+  using imperative::Layer::Layer;  // Inherit constructors
+
+  std::vector<imperative::VarBase> Forward(
+      const std::vector<imperative::VarBase>& inputs) override {
+    PYBIND11_OVERLOAD(std::vector<imperative::VarBase>, Layer, Forward,
+                      inputs);  // NOLINT
+  }
+
+  void Backward() override {
+    PYBIND11_OVERLOAD(void, Layer, Backward, );  // NOLINT
+  }
+};
+
+class PyOpBase : public imperative::OpBase {
+ public:
+  using imperative::OpBase::OpBase;  // Inherit constructors
+};
+
+class PyVarBase : public imperative::VarBase {
+ public:
+  using imperative::VarBase::VarBase;  // Inherit constructors
+};
+
+void BindTracer(pybind11::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/version.h"
+#include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
@@ -45,6 +46,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/async_executor_py.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
+#include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
@@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) {

  BindException(&m);

+  py::class_<imperative::VarBase, PyVarBase>(m, "VarBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def("_run_backward",
+           [](imperative::VarBase &self, framework::Scope *scope) {
+             self.RunBackward(scope);
+           })
+      .def("_grad", &imperative::VarBase::Grad)
+      .def_property(
+          "desc",
+          [](const imperative::VarBase &self) { return self.var_desc_; },
+          [](imperative::VarBase &self, framework::VarDesc *var_desc) {
+            self.var_desc_ = var_desc;
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::OpBase, PyOpBase>(m, "OpBase", R"DOC()DOC")
+      .def(py::init<>())
+      .def_property(
+          "desc", [](const imperative::OpBase &self) { return self.op_desc_; },
+          [](imperative::OpBase &self, framework::OpDesc *op_desc) {
+            if (op_desc) {
+              self.op_desc_ = op_desc;
+            }
+          },
+          py::return_value_policy::reference);
+
+  py::class_<imperative::Layer, PyLayer /* <--- trampoline*/> layer(m, "Layer");
+  layer.def(py::init<>())
+      .def("forward",
+           [](imperative::Layer &self,
+              const std::vector<imperative::VarBase> &inputs) {
+             return self.Forward(inputs);
+           })
+      .def("backward", &imperative::Layer::Backward);
+  BindTracer(&m);
+
  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
      .def_buffer(
          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
@@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.

  m.def("set_feed_variable", framework::SetFeedVariable);
  m.def("get_fetch_variable", framework::GetFetchVariable);
+  m.def("get_variable_tensor", framework::GetVariableTensor);

  m.def("_is_program_version_supported", IsProgramVersionSupported);


--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
+cc_test(split_test SRCS split_test.cc)
--- a/paddle/fluid/string/split.h
+++ b/paddle/fluid/string/split.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace string {
+
+static inline std::vector<std::string> Split(std::string const& original,
+                                             char separator) {
+  std::vector<std::string> results;
+  std::string token;
+  std::istringstream is(original);
+  while (std::getline(is, token, separator)) {
+    if (!token.empty()) {
+      results.push_back(token);
+    }
+  }
+  return results;
+}
+
+}  // namespace string
+}  // namespace paddle
--- a/paddle/fluid/string/split_test.cc
+++ b/paddle/fluid/string/split_test.cc
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/string/split.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringSplit, StringSplit) {
+  std::string to_split = "0,1,2,3,4,5";
+  int i = 0;
+  for (auto s : paddle::string::Split(to_split, ',')) {
+    EXPECT_EQ(atoi(s.c_str()), i);
+    i++;
+  }
+}
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -437,13 +437,11 @@ EOF
        export http_proxy=
        export https_proxy=
        # TODO: jiabin need to refine this part when these tests fixed on mac
-        ctest --output-on-failure -j $1
+        ctest --output-on-failure -j $2
        # make install should also be test when unittest
        make install -j 8
        if [ "$1" == "cp27-cp27m" ]; then
            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
-            set -e
-            python -c "import paddle.fluid"
        elif [ "$1" == "cp35-cp35m" ]; then
            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        elif [ "$1" == "cp36-cp36m" ]; then
@@ -918,7 +916,7 @@ function main() {
      maccheck)
        cmake_gen ${PYTHON_ABI:-""}
        build_mac
-        run_mac_test ${PROC_RUN:-1}
+        run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
        ;;
      macbuild)
        cmake_gen ${PYTHON_ABI:-""}

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -34,6 +34,7 @@ from . import io
 from . import evaluator
 from . import initializer
 from . import layers
+from . import imperative
 from . import contrib
 from . import nets
 from . import optimizer
@@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \
        'initializer',
        'layers',
        'contrib',
+        'imperative',
        'transpiler',
        'nets',
        'optimizer',
@@ -147,7 +149,7 @@ def __bootstrap__():
        read_env_flags += [
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
-            'cudnn_exhaustive_search'
+            'cudnn_exhaustive_search', 'selected_gpus'
        ]
    core.init_gflags([sys.argv[0]] +
                     ["--tryfromenv=" + ",".join(read_env_flags)])

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

-        square = grad * grad
+        merge_grad = grad
+        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
+            merge_grad = layers.merge_selected_rows(grad)
+            merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+
+        square = layers.square(merge_grad)
        local_norm_var = layers.reduce_sum(input=square)
        context[self.group_name].append(local_norm_var)

@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):

        new_grad = layers.elementwise_mul(
            x=grad, y=self.context[group_scale_name])
+
        return param, new_grad



--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -18,6 +18,7 @@ import collections
 import contextlib
 import re
 import six
+import sys

 import numpy as np

@@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
 ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()

+_imperative_tracer_ = None
+
+
+def _in_imperative_mode():
+    return _imperative_tracer_ is not None
+
+
+def _imperative_tracer():
+    return _imperative_tracer_
+

 class NameScope(object):
    def __init__(self, name="", parent=None):
@@ -202,7 +213,7 @@ def _debug_string_(proto, throw_on_error=True):
    return proto.__str__()


-class Variable(object):
+class Variable(core.VarBase):
    """
    In Fluid, every input and output of an operator is a variable. In most
    cases, variables are used for holding different kinds of data or training
@@ -266,6 +277,7 @@ class Variable(object):
                 stop_gradient=False,
                 is_data=False,
                 **kwargs):
+        core.VarBase.__init__(self)
        self.block = block
        self.error_clip = error_clip

@@ -346,6 +358,18 @@ class Variable(object):
        self.stop_gradient = stop_gradient
        self.is_data = is_data

+    def _numpy(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        tensor = core.get_variable_tensor(scope, self.desc.name())
+        return np.array(tensor)
+
+    def _backward(self):
+        scope = _imperative_tracer().get_scope(self.block.desc)
+        self._run_backward(scope)
+
+    def _gradient(self):
+        return np.array(self._grad())
+
    def __str__(self):
        return self.to_string(True)

@@ -492,7 +516,7 @@ class OpProtoHolder(object):
        }


-class Operator(object):
+class Operator(core.OpBase):
    """
    In Fluid, all the operation are represented by Operator, and Operator
    is regarded as a build in an instruction of a Block. Users can use the
@@ -548,6 +572,7 @@ class Operator(object):
                 inputs=None,
                 outputs=None,
                 attrs=None):
+        core.OpBase.__init__(self)
        self.block = block
        self.desc = desc
        # note: not add self.attrs here:
@@ -587,6 +612,7 @@ class Operator(object):
                    return True
            return False

+        self.inputs = []
        if inputs is not None:
            for in_proto in proto.inputs:
                found = find_name(inputs, in_proto.name)
@@ -613,6 +639,13 @@ class Operator(object):
                else:
                    self.desc.set_input(in_proto.name, [])

+            for inp in inputs.values():
+                if isinstance(inp, Variable):
+                    self.inputs.append(inp)
+                elif isinstance(inp, list) or isinstance(inp, tuple):
+                    self.inputs.extend(inp[:])
+
+        self.outputs = []
        if outputs is not None:
            given = set()
            need = set()
@@ -641,6 +674,12 @@ class Operator(object):
                    arg.op = self
                self.desc.set_output(out_proto.name, out_arg_names)

+            for out in outputs.values():
+                if isinstance(out, Variable):
+                    self.outputs.append(out)
+                elif isinstance(out, list) or isinstance(out, tuple):
+                    self.outputs.extend(out[:])
+
        if op_attrs is not None:
            if not isinstance(op_attrs, dict):
                raise TypeError("'attrs' should be a dict.")
@@ -1206,6 +1245,8 @@ class Block(object):
        """
        op_desc = self.desc.append_op()
        op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        if _in_imperative_mode():
+            _imperative_tracer().trace(op, op.inputs, op.outputs, self.desc)
        self.ops.append(op)
        return op

@@ -2209,3 +2250,12 @@ def _get_var(name, program=None):
    assert isinstance(program, Program)

    return program.global_block().var(name)
+
+
+@contextlib.contextmanager
+def _imperative_guard(tracer):
+    global _imperative_tracer_
+    tmp_trace = _imperative_tracer_
+    _imperative_tracer_ = tracer
+    yield
+    _imperative_tracer_ = tmp_trace
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import base
+from .base import *
+
+from . import layers
+from .layers import *
+
+__all__ = []
+__all__ += layers.__all__
+__all__ += base.__all__
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+
+__all__ = ['enabled', 'guard', 'to_variable']
+
+
+def enabled():
+    return framework._in_imperative_mode()
+
+
+@contextlib.contextmanager
+def guard():
+    train = framework.Program()
+    startup = framework.Program()
+    tracer = core.Tracer(train.current_block().desc)
+    with framework.program_guard(train, startup):
+        with framework.unique_name.guard():
+            with framework._imperative_guard(tracer):
+                yield
+
+
+def to_variable(value, block=None):
+    if isinstance(value, np.ndarray):
+        if not block:
+            block = framework.default_main_program().current_block()
+        py_var = framework.Variable(
+            block,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            name=None,
+            shape=value.shape,
+            dtype=value.dtype)
+        scope = framework._imperative_tracer().get_scope(block.desc)
+        var = scope.var(py_var.name)
+        tensor = var.get_tensor()
+        tensor.set(value, core.CPUPlace())
+        return py_var
+    elif isinstance(value, framework.Variable):
+        return value
+    else:
+        raise ValueError("Unsupported type %s" % type(value))
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import sys
+import numpy as np
+
+from paddle.fluid import core
+from paddle.fluid import framework
+from paddle.fluid.imperative import base
+
+__all__ = ['PyLayer']
+
+
+class PyLayer(core.Layer):
+    def __init__(self):
+        pass
+
+    def __call__(self, inputs):
+        # TODO(panyx0718): Support declarative mode as well.
+        assert base.enabled()
+        if not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            inputs = [inputs]
+
+        var_inputs = []
+        for x in inputs:
+            py_var = base.to_variable(x)
+            var_inputs.append(py_var)
+        outputs = self.forward(var_inputs)
+        return outputs
+
+    def forward(self, inputs):
+        return []
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,10 +17,13 @@ from __future__ import print_function
 import copy
 import itertools
 import six
+import sys
+import numpy as np

 from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
+from paddle.fluid.imperative import base
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
 from six.moves import zip
@@ -46,23 +49,21 @@ class LayerHelper(object):
    def startup_program(self):
        return default_startup_program()

+    def to_variable(self, x):
+        return base.to_variable(x, self.main_program.current_block())
+
    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)

    def multiple_input(self, input_param_name='input'):
        inputs = self.kwargs.get(input_param_name, [])
-        type_error = TypeError(
-            "Input of {0} layer should be Variable or sequence of Variable".
-            format(self.layer_type))
-        if isinstance(inputs, Variable):
-            inputs = [inputs]
-        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
-            raise type_error
+        ret = []
+        if isinstance(inputs, list) or isinstance(inputs, tuple):
+            for inp in inputs:
+                ret.append(self.to_variable(inp))
        else:
-            for each in inputs:
-                if not isinstance(each, Variable):
-                    raise type_error
-        return inputs
+            ret.append(self.to_variable(inputs))
+        return ret

    def input(self, input_param_name='input'):
        inputs = self.multiple_input(input_param_name)

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -169,6 +169,8 @@ __all__ = [
    'log_loss',
    'add_position_encoding',
    'bilinear_tensor_product',
+    'merge_selected_rows',
+    'get_tensor_from_selected_rows',
    'lstm',
 ]

@@ -6621,7 +6623,8 @@ def relu(x, name=None):
    helper = LayerHelper('relu', **locals())
    dtype = helper.input_dtype(input_param_name='x')
    out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    helper.append_op(
+        type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out})
    return out


@@ -8382,6 +8385,29 @@ def mean(x, name=None):
    return out


+@templatedoc()
+def merge_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+
+    helper = LayerHelper("merge_selected_rows", **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type="merge_selected_rows",
+        inputs={"X": x},
+        attrs={},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
    """
@@ -9034,3 +9060,26 @@ def bilinear_tensor_product(x,

    # add activation
    return helper.append_activation(out)
+
+
+@templatedoc()
+def get_tensor_from_selected_rows(x, name=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        name(basestring|None): Name of the output.
+
+    Returns:
+        out(${out_type}): ${out_comment}
+    """
+
+    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='get_tensor_from_selected_rows',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={})
+    return out
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -95,7 +95,14 @@ class ParallelExecutor(object):
        self._places = []
        self._act_places = []
        if use_cuda:
-            for i in six.moves.range(core.get_cuda_device_count()):
+            gpus = []
+            gpus_env = os.getenv("FLAGS_selected_gpus")
+            if gpus_env:
+                gpus = [int(s) for s in gpus_env.split(",")]
+            else:
+                for i in six.moves.range(core.get_cuda_device_count()):
+                    gpus.append(i)
+            for i in gpus:
                p = core.Place()
                self._act_places.append(core.CUDAPlace(i))
                p.set_place(self._act_places[-1])

--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-
-BATCH_SIZE = 128
-CLIP = 1
-
-prog = fluid.framework.Program()
-with fluid.program_guard(main_program=prog):
-    image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-
-    hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-    hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
-    predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
-
-    label = fluid.layers.data(name='y', shape=[1], dtype='int64')
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-prog_clip = prog.clone()
-
-avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
-
-p_g = fluid.backward.append_backward(loss=avg_cost)
-p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
-
-with fluid.program_guard(main_program=prog_clip):
-    fluid.clip.set_gradient_clip(
-        fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
-    p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
-
-grad_list = [elem[1] for elem in p_g]
-grad_clip_list = [elem[1] for elem in p_g_clip]
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.mnist.train(), buf_size=8192),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
-exe.run(fluid.default_startup_program())
-
-count = 0
-for data in train_reader():
-    count += 1
-    if count > 5:
-        break
-    out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
-    out_clip = exe.run(prog_clip,
-                       feed=feeder.feed(data),
-                       fetch_list=grad_clip_list)
-    global_norm = 0
-    for v in out[1:]:
-        global_norm += np.sum(np.power(v, 2))
-    global_norm = np.sqrt(global_norm)
-
-    global_norm_clip = 0
-    for v in out_clip[1:]:
-        global_norm_clip += np.sum(np.power(v, 2))
-    global_norm_clip = np.sqrt(global_norm_clip)
-
-    if not np.isclose(
-            a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3):
-        exit(1)
-exit(0)
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -43,13 +43,14 @@ if(APPLE)
        list(REMOVE_ITEM TEST_OPS test_desc_clone)
        list(REMOVE_ITEM TEST_OPS test_program_code)
    endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
    # this op is not support on mac
    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
    # TODO: add the unitest back when it fixed
    list(REMOVE_ITEM TEST_OPS test_detection_map_op)
    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
+    list(REMOVE_ITEM TEST_OPS test_gradient_clip)
 endif()
 if(NOT WITH_MKLML)
    # this op is not support on openblas
@@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE)
    if(NOT APPLE)
        set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
        set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
+	py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
+	set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
        # FIXME(typhoonzero): add these tests back
-	# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-	# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
 	# py_test_modules(test_dist_transformer MODULES test_dist_transformer)
 	# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
-        # TODO(typhoonzero): make dist test parallel when fix port management issue
-        set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
+        set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
    endif(NOT APPLE)
    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()

--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
        self.activation = 'identity'


+class TestIdentityActivation(TestConv2dFusionOp):
+    def init_activation(self):
+        self.activation = 'identity'
+        self.add_residual_data = False
+
+
 class TestWithGroup(TestConv2dFusionOp):
    def init_group(self):
        self.groups = 3

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -291,8 +291,8 @@ class TestDistBase(unittest.TestCase):
        if check_error_log:
            err_log.close()

-        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
        sys.stderr.write('local_stderr: %s\n' % local_err)
+        sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))

        return pickle.loads(local_out)


--- a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid.core as core
+import numpy as np
+from paddle.fluid.op import Operator
+
+
+class TestGetTensorFromSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        x_rows = [0, 5, 5, 4, 20]
+        height = 20
+        row_numel = 2
+
+        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
+        np_array[1, :] = 2.0
+        np_array[2, :] = 3.0
+        np_array[3, :] = 4.0
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(x_rows)
+        x.set_height(height)
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        # initialize input variable Out
+        out = scope.var("Out").get_tensor()
+
+        op = Operator("get_tensor_from_selected_rows", X="X", Out="Out")
+
+        op.run(scope, place)
+
+        out_array = np.array(out)
+        self.assertEqual((5, 2), out_array.shape)
+        assert (out_array == np_array).all()
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+BATCH_SIZE = 128
+CLIP = 1
+
+
+def bow_net(data,
+            label,
+            dict_dim,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=True, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost
+
+
+class TestGradientClip(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.BATCH_SIZE = 2
+        self.train_data = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict),
+            batch_size=self.BATCH_SIZE)
+
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_operators(self, place):
+        prog = fluid.framework.Program()
+        startup_program = fluid.framework.Program()
+        with fluid.program_guard(
+                main_program=prog, startup_program=startup_program):
+            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(cost)
+
+        prog_clip = prog.clone()
+
+        avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
+
+        p_g = fluid.backward.append_backward(loss=avg_cost)
+        p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
+
+        with fluid.program_guard(main_program=prog_clip):
+            fluid.clip.set_gradient_clip(
+                fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
+            p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
+
+        grad_list = [elem[1] for elem in p_g]
+        grad_clip_list = [elem[1] for elem in p_g_clip]
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=BATCH_SIZE)
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+        exe.run(startup_program)
+
+        count = 0
+        for data in train_reader():
+            count += 1
+            if count > 5:
+                break
+            out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
+            out_clip = exe.run(prog_clip,
+                               feed=feeder.feed(data),
+                               fetch_list=grad_clip_list)
+            global_norm = 0
+            for v in out[1:]:
+                global_norm += np.sum(np.power(v, 2))
+            global_norm = np.sqrt(global_norm)
+
+            global_norm_clip = 0
+            for v in out_clip[1:]:
+                global_norm_clip += np.sum(np.power(v, 2))
+            global_norm_clip = np.sqrt(global_norm_clip)
+
+            assert np.isclose(
+                a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
+
+    def check_sparse_gradient_clip(self, place):
+        prog = fluid.framework.Program()
+        startup_program = fluid.framework.Program()
+        with fluid.program_guard(
+                main_program=prog, startup_program=startup_program):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            cost = bow_net(data, label, len(self.word_dict))
+
+            fluid.clip.set_gradient_clip(
+                clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
+
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd_optimizer.minimize(cost)
+
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+        exe.run(startup_program)
+
+        data = next(self.train_data())
+        val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
+        self.assertEqual((1, ), val.shape)
+        print(val)
+        self.assertFalse(np.isnan(val))
+
+    def test_operators(self):
+        self.check_operators(core.CPUPlace())
+
+    def test_sparse_gradient_clip(self):
+        for place in self.get_places():
+            self.check_sparse_gradient_clip(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import sys
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+
+class MyLayer(fluid.imperative.PyLayer):
+    def __init__(self):
+        super(MyLayer, self).__init__()
+
+    def forward(self, inputs):
+        x = fluid.layers.relu(inputs[0])
+        self._x_for_debug = x
+        return [fluid.layers.elementwise_mul(x, x)]
+
+
+class TestImperative(unittest.TestCase):
+    def test_layer(self):
+        with fluid.imperative.guard():
+            cl = core.Layer()
+            cl.forward([])
+            l = fluid.imperative.PyLayer()
+            l.forward([])
+
+    def test_layer_in_out(self):
+        with fluid.imperative.guard():
+            l = MyLayer()
+            x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0]
+            self.assertIsNotNone(x)
+            sys.stderr.write("%s output: %s\n" % (x, x._numpy()))
+            x._backward()
+            sys.stderr.write("grad %s\n" % l._x_for_debug._gradient())
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid.core as core
+import numpy as np
+from paddle.fluid.op import Operator
+
+
+class TestMergeSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        x_rows = [0, 5, 5, 4, 20]
+        out_rows = [0, 4, 5, 20]
+        height = 20
+        row_numel = 2
+
+        np_array = np.ones((len(x_rows), row_numel)).astype("float32")
+        np_array[1, :] = 2.0
+        np_array[2, :] = 3.0
+        np_array[3, :] = 4.0
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(x_rows)
+        x.set_height(height)
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        # initialize input variable Out
+        out = scope.var("Out").get_selected_rows()
+
+        op = Operator("merge_selected_rows", X="X", Out="Out")
+
+        op.run(scope, place)
+
+        self.assertEqual(out.rows(), out_rows)
+        self.assertEqual(out.height(), height)
+
+        out_array = np.array(out.get_tensor())
+        self.assertEqual((4, 2), out_array.shape)
+
+        assert (out_array[0, :] == 1.0).all()
+        assert (out_array[1, :] == 4.0).all()
+        assert (out_array[2, :] == 5.0).all()
+        assert (out_array[3, :] == 1.0).all()
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -101,6 +101,7 @@ packages=['paddle',
          'paddle.dataset',
          'paddle.reader',
          'paddle.fluid',
+          'paddle.fluid.imperative',
          'paddle.fluid.proto',
          'paddle.fluid.proto.profiler',
          'paddle.fluid.layers',

--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py